Rework how integers are tokenized (branch: al/int-tokens)

author: Andrea Leopardi <an.leopardi@gmail.com> 2017-08-22 01:05:30 +0200
committer: Andrea Leopardi <an.leopardi@gmail.com> 2017-08-22 01:05:30 +0200
commit: 269c9f25e32a2e28118ba4e6a606d8ac3c8237d5 (patch)
tree: 6a5a7cb8ba9ce6e4e3ebee338556af518013790e
parent: 1dd4c16958663be472493044d0b6d84aa42b08d7 (diff)
download: elixir-al/int-tokens.tar.gz
5 files changed, 68 insertions, 55 deletions
diff --git a/lib/elixir/src/elixir_parser.yrl b/lib/elixir/src/elixir_parser.yrl
index 151668299..9aa7b9d5f 100644
--- a/lib/elixir/src/elixir_parser.yrl
+++ b/lib/elixir/src/elixir_parser.yrl
@@ -40,7 +40,7 @@ Terminals
   capture_op rel_op
   'true' 'false' 'nil' 'do' eol ';' ',' '.'
   '(' ')' '[' ']' '{' '}' '<<' '>>' '%{}' '%'
-  binary octal decimal float hex
+  base_integer decimal float
   .
 
 Rootsymbol grammar.
@@ -237,7 +237,7 @@ no_parens_zero_expr -> dot_identifier : build_identifier('$1', nil).
 %% marks identifiers followed by brackets as bracket_identifier.
 access_expr -> bracket_at_expr : '$1'.
 access_expr -> bracket_expr : '$1'.
-access_expr -> capture_op_eol decimal : build_unary_op('$1', ?exprs('$2')).
+access_expr -> capture_op_eol decimal : build_unary_op('$1', parse_integer_literal(?exprs('$2'))).
 access_expr -> fn_eoe stab end_eoe : build_fn('$1', reverse('$2')).
 access_expr -> open_paren stab close_paren : build_stab(reverse('$2')).
 access_expr -> open_paren stab ';' close_paren : build_stab(reverse('$2')).
@@ -262,10 +262,8 @@ access_expr -> max_expr : '$1'.
 
 %% Augment integer literals with representation format if wrap_literals_in_blocks option is true
 number -> char : handle_literal(?exprs('$1'), '$1', [{format, char}]).
-number -> binary : handle_literal(?exprs('$1'), '$1', [{format, binary}]).
-number -> octal : handle_literal(?exprs('$1'), '$1', [{format, octal}]).
-number -> decimal : handle_literal(?exprs('$1'), '$1', [{format, decimal}]).
-number -> hex : handle_literal(?exprs('$1'), '$1', [{format, hex}]).
+number -> decimal : handle_literal(parse_integer_literal(?exprs('$1')), '$1', [{original, ?exprs('$1')}]).
+number -> base_integer : handle_literal(parse_integer_literal(?exprs('$1')), '$1', [{original, ?exprs('$1')}]).
 number -> float : handle_literal(?exprs('$1'), '$1').
 
 %% Aliases and properly formed calls. Used by map_expr.
@@ -637,6 +635,15 @@ handle_literal(Literal, Token, ExtraMeta) ->
     false -> Literal
   end.
 
+parse_integer_literal([$0, $x | Rest]) ->
+  list_to_integer(Rest, 16);
+parse_integer_literal([$0, $o | Rest]) ->
+  list_to_integer(Rest, 8);
+parse_integer_literal([$0, $b | Rest]) ->
+  list_to_integer(Rest, 2);
+parse_integer_literal(Decimal) ->
+  list_to_integer(Decimal, 10).
+
 %% Operators
 
 build_op({_Kind, Location, 'in'}, {UOp, _, [Left]}, Right) when ?rearrange_uop(UOp) ->
diff --git a/lib/elixir/src/elixir_tokenizer.erl b/lib/elixir/src/elixir_tokenizer.erl
index 99f42836e..4f7c90ff2 100644
--- a/lib/elixir/src/elixir_tokenizer.erl
+++ b/lib/elixir/src/elixir_tokenizer.erl
@@ -166,15 +166,15 @@ tokenize(("<<<<<<<" ++ _) = Original, Line, 1, _Scope, Tokens) ->
 
 tokenize([$0, $x, H | T], Line, Column, Scope, Tokens) when ?is_hex(H) ->
   {Rest, Number, Length} = tokenize_hex(T, [H], 1),
-  tokenize(Rest, Line, Column + 2 + Length, Scope, [{hex, {Line, Column, Column + 2 + Length}, Number} | Tokens]);
+  tokenize(Rest, Line, Column + 2 + Length, Scope, [{base_integer, {Line, Column, Column + 2 + Length}, Number} | Tokens]);
 
 tokenize([$0, $b, H | T], Line, Column, Scope, Tokens) when ?is_bin(H) ->
   {Rest, Number, Length} = tokenize_bin(T, [H], 1),
-  tokenize(Rest, Line, Column + 2 + Length, Scope, [{binary, {Line, Column, Column + 2 + Length}, Number} | Tokens]);
+  tokenize(Rest, Line, Column + 2 + Length, Scope, [{base_integer, {Line, Column, Column + 2 + Length}, Number} | Tokens]);
 
 tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) ->
   {Rest, Number, Length} = tokenize_octal(T, [H], 1),
-  tokenize(Rest, Line, Column + 2 + Length, Scope, [{octal, {Line, Column, Column + 2 + Length}, Number} | Tokens]);
+  tokenize(Rest, Line, Column + 2 + Length, Scope, [{base_integer, {Line, Column, Column + 2 + Length}, Number} | Tokens]);
 
 % Comments
 
@@ -420,7 +420,7 @@ tokenize([H | T], Line, Column, Scope, Tokens) when ?is_digit(H) ->
   case tokenize_number(T, [H], 1, false) of
     {error, Reason, Number} ->
       {error, {Line, Reason, Number}, T, Tokens};
-    {Rest, Number, Length} when is_integer(Number) ->
+    {Rest, Number, Length} when is_list(Number) ->
       tokenize(Rest, Line, Column + Length, Scope, [{decimal, {Line, Column, Column + Length}, Number} | Tokens]);
     {Rest, Number, Length} ->
       tokenize(Rest, Line, Column + Length, Scope, [{float, {Line, Column, Column + Length}, Number} | Tokens])
@@ -831,28 +831,28 @@ tokenize_number(Rest, Acc, Length, true) ->
 
 %% Or integer.
 tokenize_number(Rest, Acc, Length, false) ->
-  {Rest, list_to_integer(lists:reverse(Acc)), Length}.
+  {Rest, lists:reverse(Acc), Length}.
 
 tokenize_hex([H | T], Acc, Length) when ?is_hex(H) ->
   tokenize_hex(T, [H | Acc], Length + 1);
 tokenize_hex([$_, H | T], Acc, Length) when ?is_hex(H) ->
   tokenize_hex(T, [H | Acc], Length + 2);
 tokenize_hex(Rest, Acc, Length) ->
-  {Rest, list_to_integer(lists:reverse(Acc), 16), Length}.
+  {Rest, [$0, $x | lists:reverse(Acc)], Length}.
 
 tokenize_octal([H | T], Acc, Length) when ?is_octal(H) ->
   tokenize_octal(T, [H | Acc], Length + 1);
 tokenize_octal([$_, H | T], Acc, Length) when ?is_octal(H) ->
   tokenize_octal(T, [H | Acc], Length + 2);
 tokenize_octal(Rest, Acc, Length) ->
-  {Rest, list_to_integer(lists:reverse(Acc), 8), Length}.
+  {Rest, [$0, $o | lists:reverse(Acc)], Length}.
 
 tokenize_bin([H | T], Acc, Length) when ?is_bin(H) ->
   tokenize_bin(T, [H | Acc], Length + 1);
 tokenize_bin([$_, H | T], Acc, Length) when ?is_bin(H) ->
   tokenize_bin(T, [H | Acc], Length + 2);
 tokenize_bin(Rest, Acc, Length) ->
-  {Rest, list_to_integer(lists:reverse(Acc), 2), Length}.
+  {Rest, [$0, $b | lists:reverse(Acc)], Length}.
 
 %% Comments
 
diff --git a/lib/elixir/test/elixir/code_test.exs b/lib/elixir/test/elixir/code_test.exs
index 6fc40983a..4e0b24c4a 100644
--- a/lib/elixir/test/elixir/code_test.exs
+++ b/lib/elixir/test/elixir/code_test.exs
@@ -97,7 +97,7 @@ defmodule CodeTest do
 
   test "string_to_quoted/1" do
     assert Code.string_to_quoted("1 + 2") == {:ok, {:+, [line: 1], [1, 2]}}
-    assert Code.string_to_quoted("a.1") == {:error, {1, "syntax error before: ", "1"}}
+    assert Code.string_to_quoted("a.1") == {:error, {1, "syntax error before: ", "\"1\""}}
   end
 
   test "string_to_quoted/1 for presence of sigils terminators" do
@@ -133,15 +133,15 @@ defmodule CodeTest do
     assert Code.string_to_quoted("\"one\"", wrap_literals_in_blocks: true) == {:ok, {:__block__, [line: 1], ["one"]}}
     assert Code.string_to_quoted("\"one\"") == {:ok, "one"}
     assert Code.string_to_quoted("?é", wrap_literals_in_blocks: true) == {:ok, {:__block__, [format: :char, line: 1], [233]}}
-    assert Code.string_to_quoted("0b10", wrap_literals_in_blocks: true) == {:ok, {:__block__, [format: :binary, line: 1], [2]}}
-    assert Code.string_to_quoted("12", wrap_literals_in_blocks: true) == {:ok, {:__block__, [format: :decimal, line: 1], [12]}}
-    assert Code.string_to_quoted("0o123", wrap_literals_in_blocks: true) == {:ok, {:__block__, [format: :octal, line: 1], [83]}}
-    assert Code.string_to_quoted("0xEF", wrap_literals_in_blocks: true) == {:ok, {:__block__, [format: :hex, line: 1], [239]}}
+    assert Code.string_to_quoted("0b10", wrap_literals_in_blocks: true) == {:ok, {:__block__, [original: '0b10', line: 1], [2]}}
+    assert Code.string_to_quoted("12", wrap_literals_in_blocks: true) == {:ok, {:__block__, [original: '12', line: 1], [12]}}
+    assert Code.string_to_quoted("0o123", wrap_literals_in_blocks: true) == {:ok, {:__block__, [original: '0o123', line: 1], [83]}}
+    assert Code.string_to_quoted("0xEF", wrap_literals_in_blocks: true) == {:ok, {:__block__, [original: '0xEF', line: 1], [239]}}
     assert Code.string_to_quoted("12.3", wrap_literals_in_blocks: true) == {:ok, {:__block__, [line: 1], [12.3]}}
     assert Code.string_to_quoted("nil", wrap_literals_in_blocks: true) == {:ok, {:__block__, [line: 1], [nil]}}
     assert Code.string_to_quoted(":one", wrap_literals_in_blocks: true) == {:ok, {:__block__, [line: 1], [:one]}}
     assert Code.string_to_quoted("[1]", wrap_literals_in_blocks: true) ==
-           {:ok, {:__block__, [line: 1], [[{:__block__, [format: :decimal, line: 1], [1]}]]}}
+           {:ok, {:__block__, [line: 1], [[{:__block__, [original: '1', line: 1], [1]}]]}}
     assert Code.string_to_quoted("{:ok, :test}", wrap_literals_in_blocks: true) ==
            {:ok, {:__block__, [line: 1], [{{:__block__, [line: 1], [:ok]}, {:__block__, [line: 1], [:test]}}]}}
     assert Code.string_to_quoted("\"\"\"\nhello\n\"\"\"", wrap_literals_in_blocks: true)
diff --git a/lib/elixir/test/erlang/string_test.erl b/lib/elixir/test/erlang/string_test.erl
index 8ec706bf6..e55fcf835 100644
--- a/lib/elixir/test/erlang/string_test.erl
+++ b/lib/elixir/test/erlang/string_test.erl
@@ -41,12 +41,12 @@ extract_interpolations_with_only_two_interpolations_test() ->
 
 extract_interpolations_with_tuple_inside_interpolation_test() ->
   [<<"f">>,
-   {{1, 2, 8}, [{'{', {1, 4, 5}}, {decimal, {1, 5, 6}, 1}, {'}', {1, 6, 7}}]},
+   {{1, 2, 8}, [{'{', {1, 4, 5}}, {decimal, {1, 5, 6}, "1"}, {'}', {1, 6, 7}}]},
    <<"o">>] = extract_interpolations("f#{{1}}o").
 
 extract_interpolations_with_many_expressions_inside_interpolation_test() ->
   [<<"f">>,
-   {{1, 2, 3}, [{decimal, {1, 4, 5}, 1}, {eol, {1, 5, 6}}, {decimal, {2, 1, 2}, 2}]},
+   {{1, 2, 3}, [{decimal, {1, 4, 5}, "1"}, {eol, {1, 5, 6}}, {decimal, {2, 1, 2}, "2"}]},
     <<"o">>] = extract_interpolations("f#{1\n2}o").
 
 extract_interpolations_with_right_curly_inside_string_inside_interpolation_test() ->
@@ -66,7 +66,7 @@ extract_interpolations_with_escaped_quote_inside_string_inside_interpolation_tes
 
 extract_interpolations_with_less_than_operation_inside_interpolation_test() ->
   [<<"f">>,
-   {{1, 2, 8}, [{decimal, {1, 4, 5}, 1}, {rel_op, {1, 5, 6}, '<'}, {decimal, {1, 6, 7}, 2}]},
+   {{1, 2, 8}, [{decimal, {1, 4, 5}, "1"}, {rel_op, {1, 5, 6}, '<'}, {decimal, {1, 6, 7}, "2"}]},
    <<"o">>] = extract_interpolations("f#{1<2}o").
 
 extract_interpolations_with_an_escaped_character_test() ->
diff --git a/lib/elixir/test/erlang/tokenizer_test.erl b/lib/elixir/test/erlang/tokenizer_test.erl
index da8c97474..b94017090 100644
--- a/lib/elixir/test/erlang/tokenizer_test.erl
+++ b/lib/elixir/test/erlang/tokenizer_test.erl
@@ -13,16 +13,20 @@ tokenize_error(String) ->
   Error.
 
 type_test() ->
-  [{decimal, {1, 1, 2}, 1}, {type_op, {1, 3, 5}, '::'}, {decimal, {1, 6, 7}, 3}] = tokenize("1 :: 3"),
+  [{decimal, {1, 1, 2}, "1"}, {type_op, {1, 3, 5}, '::'}, {decimal, {1, 6, 7}, "3"}] = tokenize("1 :: 3"),
   [{identifier, {1, 1, 5}, name},
    {'.', {1, 5, 6}},
    {paren_identifier, {1, 6, 8}, '::'},
    {'(', {1, 8, 9}},
-   {decimal, {1, 9, 10}, 3},
+   {decimal, {1, 9, 10}, "3"},
    {')', {1, 10, 11}}] = tokenize("name.::(3)").
 
 arithmetic_test() ->
-  [{decimal, {1, 1, 2}, 1}, {dual_op, {1, 3, 4}, '+'}, {decimal, {1, 5, 6}, 2}, {dual_op, {1, 7, 8}, '+'}, {decimal, {1, 9, 10}, 3}] = tokenize("1 + 2 + 3").
+  [{decimal, {1, 1, 2}, "1"},
+   {dual_op, {1, 3, 4}, '+'},
+   {decimal, {1, 5, 6}, "2"},
+   {dual_op, {1, 7, 8}, '+'},
+   {decimal, {1, 9, 10}, "3"}] = tokenize("1 + 2 + 3").
 
 op_kw_test() ->
   [{atom, {1, 1, 5}, foo}, {dual_op, {1, 5, 6}, '+'}, {atom, {1, 6, 10}, bar}] = tokenize(":foo+:bar").
@@ -33,12 +37,12 @@ scientific_test() ->
   {1, "invalid float number ", "1.0e309"} = tokenize_error("1.0e309").
 
 hex_bin_octal_test() ->
-  [{hex, {1, 1, 5}, 255}] = tokenize("0xFF"),
-  [{hex, {1, 1, 6}, 255}] = tokenize("0xF_F"),
-  [{octal, {1, 1, 5}, 63}] = tokenize("0o77"),
-  [{octal, {1, 1, 6}, 63}] = tokenize("0o7_7"),
-  [{binary, {1, 1, 5}, 3}] = tokenize("0b11"),
-  [{binary, {1, 1, 6}, 3}] = tokenize("0b1_1").
+  [{base_integer, {1, 1, 5}, "0xFF"}] = tokenize("0xFF"),
+  [{base_integer, {1, 1, 6}, "0xFF"}] = tokenize("0xF_F"),
+  [{base_integer, {1, 1, 5}, "0o77"}] = tokenize("0o77"),
+  [{base_integer, {1, 1, 6}, "0o77"}] = tokenize("0o7_7"),
+  [{base_integer, {1, 1, 5}, "0b11"}] = tokenize("0b11"),
+  [{base_integer, {1, 1, 6}, "0b11"}] = tokenize("0b1_1").
 
 unquoted_atom_test() ->
   [{atom, {1, 1, 3}, '+'}] = tokenize(":+"),
@@ -68,10 +72,10 @@ kw_test() ->
   [{kw_identifier_unsafe, {1, 1, 10}, [<<"foo bar">>]}] = tokenize("\"foo bar\": ").
 
 integer_test() ->
-  [{decimal, {1, 1, 4}, 123}] = tokenize("123"),
-  [{decimal, {1, 1, 4}, 123}, {';', {1, 4, 5}}] = tokenize("123;"),
-  [{eol, {1, 1, 2}}, {decimal, {3, 1, 4}, 123}] = tokenize("\n\n123"),
-  [{decimal, {1, 3, 6}, 123}, {decimal, {1, 8, 11}, 234}] = tokenize("  123  234  ").
+  [{decimal, {1, 1, 4}, "123"}] = tokenize("123"),
+  [{decimal, {1, 1, 4}, "123"}, {';', {1, 4, 5}}] = tokenize("123;"),
+  [{eol, {1, 1, 2}}, {decimal, {3, 1, 4}, "123"}] = tokenize("\n\n123"),
+  [{decimal, {1, 3, 6}, "123"}, {decimal, {1, 8, 11}, "234"}] = tokenize("  123  234  ").
 
 float_test() ->
   [{float, {1, 1, 5}, 12.3}] = tokenize("12.3"),
@@ -82,9 +86,11 @@ float_test() ->
   {1, "invalid float number ", OversizedFloat} = tokenize_error(OversizedFloat).
 
 comments_test() ->
-  [{decimal, {1, 1, 2}, 1}, {eol, {1, 3, 4}}, {decimal, {2, 1, 2}, 2}] = tokenize("1 # Comment\n2"),
-  [{decimal, {1, 1, 2}, 1}, {comment, {1, 3, 12}, "# Comment"},
-   {eol, {1, 12, 13}}, {decimal, {2, 1, 2}, 2}] = tokenize("1 # Comment\n2", [{preserve_comments, true}]),
+  [{decimal, {1, 1, 2}, "1"}, {eol, {1, 3, 4}}, {decimal, {2, 1, 2}, "2"}] = tokenize("1 # Comment\n2"),
+  [{decimal, {1, 1, 2}, "1"},
+   {comment, {1, 3, 12}, "# Comment"},
+   {eol, {1, 12, 13}},
+   {decimal, {2, 1, 2}, "2"}] = tokenize("1 # Comment\n2", [{preserve_comments, true}]),
   [{comment, {1, 1, 10}, "# Comment"}] = tokenize("# Comment", [{preserve_comments, true}]).
 
 identifier_test() ->
@@ -118,24 +124,24 @@ newline_test() ->
   [{identifier, {1, 1, 4}, foo},
    {'.', {2, 1, 2}},
    {identifier, {2, 2, 5}, bar}]  = tokenize("foo\n.bar"),
-  [{decimal, {1, 1, 2}, 1},
+  [{decimal, {1, 1, 2}, "1"},
    {two_op, {2, 1, 3}, '++'},
-   {decimal, {2, 3, 4}, 2}]  = tokenize("1\n++2").
+   {decimal, {2, 3, 4}, "2"}]  = tokenize("1\n++2").
 
 dot_newline_operator_test() ->
   [{identifier, {1, 1, 4}, foo},
    {'.', {1, 4, 5}},
    {identifier, {2, 1, 2}, '+'},
-   {decimal, {2, 2, 3}, 1}] = tokenize("foo.\n+1"),
+   {decimal, {2, 2, 3}, "1"}] = tokenize("foo.\n+1"),
   [{identifier, {1, 1, 4}, foo},
    {'.', {1, 4, 5}},
    {identifier, {2, 1, 2}, '+'},
-   {decimal, {2, 2, 3}, 1}] = tokenize("foo.#bar\n+1"),
+   {decimal, {2, 2, 3}, "1"}] = tokenize("foo.#bar\n+1"),
   [{identifier, {1, 1, 4}, foo},
    {'.', {1, 4, 5}},
    {comment, {1, 5, 9}, "#bar"},
    {identifier, {2, 1, 2}, '+'},
-   {decimal, {2, 2, 3}, 1}] = tokenize("foo.#bar\n+1", [{preserve_comments, true}]).
+   {decimal, {2, 2, 3}, "1"}] = tokenize("foo.#bar\n+1", [{preserve_comments, true}]).
 
 aliases_test() ->
   [{'aliases', {1, 1, 4}, ['Foo']}] = tokenize("Foo"),
@@ -158,8 +164,8 @@ addadd_test() ->
   [{identifier, {1, 1, 2}, x}, {two_op, {1, 3, 5}, '++'}, {identifier, {1, 6, 7}, y}] = tokenize("x ++ y").
 
 space_test() ->
-  [{op_identifier, {1, 1, 4}, foo}, {dual_op, {1, 5, 6}, '-'}, {decimal, {1, 6, 7}, 2}] = tokenize("foo -2"),
-  [{op_identifier, {1, 1, 4}, foo}, {dual_op, {1, 6, 7}, '-'}, {decimal, {1, 7, 8}, 2}] = tokenize("foo  -2").
+  [{op_identifier, {1, 1, 4}, foo}, {dual_op, {1, 5, 6}, '-'}, {decimal, {1, 6, 7}, "2"}] = tokenize("foo -2"),
+  [{op_identifier, {1, 1, 4}, foo}, {dual_op, {1, 6, 7}, '-'}, {decimal, {1, 7, 8}, "2"}] = tokenize("foo  -2").
 
 chars_test() ->
   [{char, {1, 1, 3}, 97}] = tokenize("?a"),
@@ -178,17 +184,17 @@ interpolation_test() ->
 capture_test() ->
   [{capture_op, {1, 1, 2}, '&'},
    {identifier, {1, 2, 4}, '||'},
-   {mult_op,    {1, 4, 5}, '/'},
-   {decimal,     {1, 5, 6}, 2}] = tokenize("&||/2"),
+   {mult_op, {1, 4, 5}, '/'},
+   {decimal, {1, 5, 6}, "2"}] = tokenize("&||/2"),
   [{capture_op, {1, 1, 2}, '&'},
    {identifier, {1, 2, 4}, 'or'},
-   {mult_op,    {1, 4, 5}, '/'},
-   {decimal,     {1, 5, 6}, 2}] = tokenize("&or/2"),
-  [{capture_op,{1,1,2},'&'},
-   {unary_op,{1,2,5},'not'},
-   {decimal,{1,6,7},1},
-   {',',{1,7,8}},
-   {decimal,{1,9,10},2}] = tokenize("&not 1, 2").
+   {mult_op, {1, 4, 5}, '/'},
+   {decimal, {1, 5, 6}, "2"}] = tokenize("&or/2"),
+  [{capture_op, {1, 1, 2}, '&'},
+   {unary_op, {1, 2, 5}, 'not'},
+   {decimal, {1, 6, 7}, "1"},
+   {',', {1, 7, 8}},
+   {decimal, {1 ,9, 10}, "2"}] = tokenize("&not 1, 2").
 
 vc_merge_conflict_test() ->
   {1, "found an unexpected version control marker, please resolve the conflicts: ", "<<<<<<< HEAD"} =
author	Andrea Leopardi <an.leopardi@gmail.com>	2017-08-22 01:05:30 +0200
committer	Andrea Leopardi <an.leopardi@gmail.com>	2017-08-22 01:05:30 +0200
commit	269c9f25e32a2e28118ba4e6a606d8ac3c8237d5 (patch)
tree	6a5a7cb8ba9ce6e4e3ebee338556af518013790e
parent	1dd4c16958663be472493044d0b6d84aa42b08d7 (diff)
download	elixir-al/int-tokens.tar.gz