-module(elixir_tokenizer). -include("elixir.hrl"). -export([tokenize/1, tokenize/3, tokenize/4, invalid_do_error/1]). %% Numbers -define(is_hex(S), (?is_digit(S) orelse (S >= $A andalso S =< $F) orelse (S >= $a andalso S =< $f))). -define(is_bin(S), (S >= $0 andalso S =< $1)). -define(is_octal(S), (S >= $0 andalso S =< $7)). %% Digits and letters -define(is_digit(S), (S >= $0 andalso S =< $9)). -define(is_upcase(S), (S >= $A andalso S =< $Z)). -define(is_downcase(S), (S >= $a andalso S =< $z)). %% Others -define(is_quote(S), (S == $" orelse S == $')). -define(is_sigil(S), ((S == $/) orelse (S == $<) orelse (S == $") orelse (S == $') orelse (S == $[) orelse (S == $() orelse (S == ${) orelse (S == $|))). %% Spaces -define(is_horizontal_space(S), ((S == $\s) orelse (S == $\t))). -define(is_vertical_space(S), ((S == $\r) orelse (S == $\n))). -define(is_space(S), (?is_horizontal_space(S) orelse ?is_vertical_space(S))). %% Operators -define(at_op(T), T == $@). -define(capture_op(T), T == $&). -define(unary_op(T), T == $!; T == $^). -define(unary_op3(T1, T2, T3), T1 == $~, T2 == $~, T3 == $~). -define(list_op(T1, T2), T1 == $+, T2 == $+; T1 == $-, T2 == $-). -define(two_op(T1, T2), T1 == $<, T2 == $>; T1 == $., T2 == $.). -define(three_op(T1, T2, T3), T1 == $^, T2 == $^, T3 == $^). -define(mult_op(T), T == $* orelse T == $/). -define(dual_op(T), T == $+ orelse T == $-). -define(arrow_op3(T1, T2, T3), T1 == $<, T2 == $<, T3 == $<; T1 == $>, T2 == $>, T3 == $>; T1 == $~, T2 == $>, T3 == $>; T1 == $<, T2 == $<, T3 == $~; T1 == $<, T2 == $~, T3 == $>; T1 == $<, T2 == $|, T3 == $>). -define(arrow_op(T1, T2), T1 == $|, T2 == $>; T1 == $~, T2 == $>; T1 == $<, T2 == $~). -define(rel_op(T), T == $<; T == $>). -define(rel_op2(T1, T2), T1 == $<, T2 == $=; T1 == $>, T2 == $=). -define(comp_op2(T1, T2), T1 == $=, T2 == $=; T1 == $=, T2 == $~; T1 == $!, T2 == $=). -define(comp_op3(T1, T2, T3), T1 == $=, T2 == $=, T3 == $=; T1 == $!, T2 == $=, T3 == $=). -define(and_op(T1, T2), T1 == $&, T2 == $&). -define(or_op(T1, T2), T1 == $|, T2 == $|). -define(and_op3(T1, T2, T3), T1 == $&, T2 == $&, T3 == $&). -define(or_op3(T1, T2, T3), T1 == $|, T2 == $|, T3 == $|). -define(match_op(T), T == $=). -define(in_match_op(T1, T2), T1 == $<, T2 == $-; T1 == $\\, T2 == $\\). -define(stab_op(T1, T2), T1 == $-, T2 == $>). -define(type_op(T1, T2), T1 == $:, T2 == $:). -define(pipe_op(T), T == $|). tokenize(String, Line, Column, #elixir_tokenizer{} = Scope) -> tokenize(String, Line, Column, Scope, []); tokenize(String, Line, Column, Opts) -> IdentifierTokenizer = elixir_config:get(identifier_tokenizer, 'Elixir.String.Tokenizer'), Scope = lists:foldl(fun ({file, File}, Acc) when is_binary(File) -> Acc#elixir_tokenizer{file=File}; ({existing_atoms_only, ExistingAtomsOnly}, Acc) when is_boolean(ExistingAtomsOnly) -> Acc#elixir_tokenizer{existing_atoms_only=ExistingAtomsOnly}; ({check_terminators, CheckTerminators}, Acc) when is_boolean(CheckTerminators) -> Acc#elixir_tokenizer{check_terminators=CheckTerminators}; ({preserve_comments, PreserveComments}, Acc) when is_function(PreserveComments) -> Acc#elixir_tokenizer{preserve_comments=PreserveComments}; ({unescape, Unescape}, Acc) when is_boolean(Unescape) -> Acc#elixir_tokenizer{unescape=Unescape}; ({warn_on_unnecessary_quotes, Unnecessary}, Acc) when is_boolean(Unnecessary) -> Acc#elixir_tokenizer{warn_on_unnecessary_quotes=Unnecessary}; (_, Acc) -> Acc end, #elixir_tokenizer{identifier_tokenizer=IdentifierTokenizer}, Opts), tokenize(String, Line, Column, Scope, []). tokenize(String, Line, Opts) -> tokenize(String, Line, 1, Opts). tokenize([], _Line, _Column, #elixir_tokenizer{terminators=[]}, Tokens) -> {ok, lists:reverse(Tokens)}; tokenize([], EndLine, Column, Scope, Tokens) -> #elixir_tokenizer{terminators=[{Start, StartLine, _} | _]} = Scope, End = terminator(Start), Hint = missing_terminator_hint(Start, End, Scope), Message = io_lib:format("missing terminator: ~ts (for \"~ts\" starting at line ~B)", [End, Start, StartLine]), {error, {EndLine, Column, [Message, Hint], []}, [], Tokens}; % VC merge conflict tokenize(("<<<<<<<" ++ _) = Original, Line, 1, _Scope, Tokens) -> FirstLine = lists:takewhile(fun(C) -> C =/= $\n andalso C =/= $\r end, Original), {error, {Line, 1, "found an unexpected version control marker, please resolve the conflicts: ", FirstLine}, Original, Tokens}; % Base integers tokenize([$0, $x, H | T], Line, Column, Scope, Tokens) when ?is_hex(H) -> {Rest, Number, OriginalRepresentation, Length} = tokenize_hex(T, [H], 1), Token = {int, {Line, Column, Number}, OriginalRepresentation}, tokenize(Rest, Line, Column + 2 + Length, Scope, [Token | Tokens]); tokenize([$0, $b, H | T], Line, Column, Scope, Tokens) when ?is_bin(H) -> {Rest, Number, OriginalRepresentation, Length} = tokenize_bin(T, [H], 1), Token = {int, {Line, Column, Number}, OriginalRepresentation}, tokenize(Rest, Line, Column + 2 + Length, Scope, [Token | Tokens]); tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) -> {Rest, Number, OriginalRepresentation, Length} = tokenize_octal(T, [H], 1), Token = {int, {Line, Column, Number}, OriginalRepresentation}, tokenize(Rest, Line, Column + 2 + Length, Scope, [Token | Tokens]); % Comments tokenize([$# | String], Line, Column, Scope, Tokens) -> {Rest, Comment} = tokenize_comment(String, [$#]), preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), tokenize(Rest, Line, Column, Scope, reset_eol(Tokens)); % Sigils tokenize([$~, S, H, H, H | T] = Original, Line, Column, Scope, Tokens) when ?is_quote(H), ?is_upcase(S) orelse ?is_downcase(S) -> case extract_heredoc_with_interpolation(Line, Column, Scope, ?is_downcase(S), T, H) of {ok, NewLine, NewColumn, Parts, Rest} -> {Final, Modifiers} = collect_modifiers(Rest, []), Token = {sigil, {Line, Column, nil}, S, Parts, Modifiers, <>}, NewColumnWithModifiers = NewColumn + length(Modifiers), tokenize(Final, NewLine, NewColumnWithModifiers, Scope, [Token | Tokens]); {error, Reason} -> {error, Reason, Original, Tokens} end; tokenize([$~, S, H | T] = Original, Line, Column, Scope, Tokens) when ?is_sigil(H), ?is_upcase(S) orelse ?is_downcase(S) -> case elixir_interpolation:extract(Line, Column + 3, Scope, ?is_downcase(S), T, sigil_terminator(H)) of {NewLine, NewColumn, Parts, Rest} -> {Final, Modifiers} = collect_modifiers(Rest, []), Token = {sigil, {Line, Column, nil}, S, tokens_to_binary(Parts), Modifiers, <>}, NewColumnWithModifiers = NewColumn + length(Modifiers), tokenize(Final, NewLine, NewColumnWithModifiers, Scope, [Token | Tokens]); {error, Reason} -> Sigil = [$~, S, H], interpolation_error(Reason, Original, Tokens, " (for sigil ~ts starting at line ~B)", [Sigil, Line]) end; tokenize([$~, S, H | _] = Original, Line, Column, _Scope, Tokens) when ?is_upcase(S) orelse ?is_downcase(S) -> MessageString = "\"~ts\" (column ~p, codepoint U+~4.16.0B). The available delimiters are: " "//, ||, \"\", '', (), [], {}, <>", Message = io_lib:format(MessageString, [[H], Column + 2, H]), {error, {Line, Column, "invalid sigil delimiter: ", Message}, Original, Tokens}; % Char tokens % We tokenize char literals (?a) as {char, _, CharInt} instead of {number, _, % CharInt}. This is exactly what Erlang does with Erlang char literals % ($a). This means we'll have to adjust the error message for char literals in % elixir_errors.erl as by default {char, _, _} tokens are "hijacked" by Erlang % and printed with Erlang syntax ($a) in the parser's error messages. tokenize([$?, $\\, H | T], Line, Column, Scope, Tokens) -> Char = elixir_interpolation:unescape_map(H), Token = {char, {Line, Column, [$?, $\\, H]}, Char}, tokenize(T, Line, Column + 3, Scope, [Token | Tokens]); tokenize([$?, Char | T], Line, Column, Scope, Tokens) -> case handle_char(Char) of {Escape, Name} -> Msg = io_lib:format("found ? followed by codepoint 0x~.16B (~ts), please use ?~ts instead", [Char, Name, Escape]), elixir_errors:warn(Line, Scope#elixir_tokenizer.file, Msg); false -> ok end, Token = {char, {Line, Column, [$?, Char]}, Char}, tokenize(T, Line, Column + 2, Scope, [Token | Tokens]); % Heredocs tokenize("\"\"\"" ++ T, Line, Column, Scope, Tokens) -> handle_heredocs(T, Line, Column, $", Scope, Tokens); tokenize("'''" ++ T, Line, Column, Scope, Tokens) -> handle_heredocs(T, Line, Column, $', Scope, Tokens); % Strings tokenize([$" | T], Line, Column, Scope, Tokens) -> handle_strings(T, Line, Column + 1, $", Scope, Tokens); tokenize([$' | T], Line, Column, Scope, Tokens) -> handle_strings(T, Line, Column + 1, $', Scope, Tokens); % Operator atoms tokenize("...:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) -> tokenize(Rest, Line, Column + 4, Scope, [{kw_identifier, {Line, Column, nil}, '...'} | Tokens]); tokenize("<<>>:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) -> tokenize(Rest, Line, Column + 5, Scope, [{kw_identifier, {Line, Column, nil}, '<<>>'} | Tokens]); tokenize("%{}:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) -> tokenize(Rest, Line, Column + 4, Scope, [{kw_identifier, {Line, Column, nil}, '%{}'} | Tokens]); tokenize("%:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) -> tokenize(Rest, Line, Column + 2, Scope, [{kw_identifier, {Line, Column, nil}, '%'} | Tokens]); tokenize("{}:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) -> tokenize(Rest, Line, Column + 3, Scope, [{kw_identifier, {Line, Column, nil}, '{}'} | Tokens]); tokenize(":..." ++ Rest, Line, Column, Scope, Tokens) -> tokenize(Rest, Line, Column + 4, Scope, [{atom, {Line, Column, nil}, '...'} | Tokens]); tokenize(":<<>>" ++ Rest, Line, Column, Scope, Tokens) -> tokenize(Rest, Line, Column + 5, Scope, [{atom, {Line, Column, nil}, '<<>>'} | Tokens]); tokenize(":%{}" ++ Rest, Line, Column, Scope, Tokens) -> tokenize(Rest, Line, Column + 4, Scope, [{atom, {Line, Column, nil}, '%{}'} | Tokens]); tokenize(":%" ++ Rest, Line, Column, Scope, Tokens) -> tokenize(Rest, Line, Column + 2, Scope, [{atom, {Line, Column, nil}, '%'} | Tokens]); tokenize(":{}" ++ Rest, Line, Column, Scope, Tokens) -> tokenize(Rest, Line, Column + 3, Scope, [{atom, {Line, Column, nil}, '{}'} | Tokens]); % ## Three Token Operators tokenize([$:, T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?unary_op3(T1, T2, T3); ?comp_op3(T1, T2, T3); ?and_op3(T1, T2, T3); ?or_op3(T1, T2, T3); ?arrow_op3(T1, T2, T3); ?three_op(T1, T2, T3) -> Token = {atom, {Line, Column, nil}, list_to_atom([T1, T2, T3])}, tokenize(Rest, Line, Column + 4, Scope, [Token | Tokens]); % ## Two Token Operators tokenize([$:, T1, T2 | Rest], Line, Column, Scope, Tokens) when ?comp_op2(T1, T2); ?rel_op2(T1, T2); ?and_op(T1, T2); ?or_op(T1, T2); ?arrow_op(T1, T2); ?in_match_op(T1, T2); ?two_op(T1, T2); ?list_op(T1, T2); ?stab_op(T1, T2); ?type_op(T1, T2) -> Token = {atom, {Line, Column, nil}, list_to_atom([T1, T2])}, tokenize(Rest, Line, Column + 3, Scope, [Token | Tokens]); % ## Single Token Operators tokenize([$:, T | Rest], Line, Column, Scope, Tokens) when ?at_op(T); ?unary_op(T); ?capture_op(T); ?dual_op(T); ?mult_op(T); ?rel_op(T); ?match_op(T); ?pipe_op(T); T == $. -> Token = {atom, {Line, Column, nil}, list_to_atom([T])}, tokenize(Rest, Line, Column + 2, Scope, [Token | Tokens]); % Stand-alone tokens tokenize("..." ++ Rest, Line, Column, Scope, Tokens) -> maybe_warn_too_many_of_same_char("...", Rest, Line, Scope), Token = check_call_identifier(Line, Column, '...', Rest), tokenize(Rest, Line, Column + 3, Scope, [Token | Tokens]); tokenize("=>" ++ Rest, Line, Column, Scope, Tokens) -> Token = {assoc_op, {Line, Column, previous_was_eol(Tokens)}, '=>'}, tokenize(Rest, Line, Column + 2, Scope, add_token_with_eol(Token, Tokens)); % ## Three token operators tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?unary_op3(T1, T2, T3) -> handle_unary_op(Rest, Line, Column, unary_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?comp_op3(T1, T2, T3) -> handle_op(Rest, Line, Column, comp_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?and_op3(T1, T2, T3) -> maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Scope), handle_op(Rest, Line, Column, and_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?or_op3(T1, T2, T3) -> maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Scope), handle_op(Rest, Line, Column, or_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?three_op(T1, T2, T3) -> maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Scope), handle_op(Rest, Line, Column, three_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?arrow_op3(T1, T2, T3) -> handle_op(Rest, Line, Column, arrow_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); % ## Containers + punctuation tokens tokenize([$, | Rest], Line, Column, Scope, Tokens) -> Token = {',', {Line, Column, 0}}, tokenize(Rest, Line, Column + 1, Scope, [Token | Tokens]); tokenize([$<, $< | Rest], Line, Column, Scope, Tokens) -> Token = {'<<', {Line, Column, nil}}, handle_terminator(Rest, Line, Column + 2, Scope, Token, Tokens); tokenize([$>, $> | Rest], Line, Column, Scope, Tokens) -> Token = {'>>', {Line, Column, previous_was_eol(Tokens)}}, handle_terminator(Rest, Line, Column + 2, Scope, Token, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when T == $(; T == ${; T == $[ -> Token = {list_to_atom([T]), {Line, Column, nil}}, handle_terminator(Rest, Line, Column + 1, Scope, Token, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when T == $); T == $}; T == $] -> Token = {list_to_atom([T]), {Line, Column, previous_was_eol(Tokens)}}, handle_terminator(Rest, Line, Column + 1, Scope, Token, Tokens); % ## Two Token Operators tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?two_op(T1, T2) -> handle_op(Rest, Line, Column, two_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?list_op(T1, T2) -> maybe_warn_too_many_of_same_char([T1, T2], Rest, Line, Scope), handle_op(Rest, Line, Column, two_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?arrow_op(T1, T2) -> handle_op(Rest, Line, Column, arrow_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?comp_op2(T1, T2) -> handle_op(Rest, Line, Column, comp_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?rel_op2(T1, T2) -> handle_op(Rest, Line, Column, rel_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?and_op(T1, T2) -> handle_op(Rest, Line, Column, and_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?or_op(T1, T2) -> handle_op(Rest, Line, Column, or_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?in_match_op(T1, T2) -> handle_op(Rest, Line, Column, in_match_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?type_op(T1, T2) -> handle_op(Rest, Line, Column, type_op, 2, list_to_atom([T1, T2]), Scope, Tokens); tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?stab_op(T1, T2) -> handle_op(Rest, Line, Column, stab_op, 2, list_to_atom([T1, T2]), Scope, Tokens); % ## Single Token Operators tokenize([T | Rest], Line, Column, Scope, Tokens) when ?at_op(T) -> handle_unary_op(Rest, Line, Column, at_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?capture_op(T) -> handle_unary_op(Rest, Line, Column, capture_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?unary_op(T) -> handle_unary_op(Rest, Line, Column, unary_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?rel_op(T) -> handle_op(Rest, Line, Column, rel_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?dual_op(T) -> handle_unary_op(Rest, Line, Column, dual_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?mult_op(T) -> handle_op(Rest, Line, Column, mult_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?match_op(T) -> handle_op(Rest, Line, Column, match_op, 1, list_to_atom([T]), Scope, Tokens); tokenize([T | Rest], Line, Column, Scope, Tokens) when ?pipe_op(T) -> handle_op(Rest, Line, Column, pipe_op, 1, list_to_atom([T]), Scope, Tokens); % Non-operator Atoms tokenize([$:, H | T] = Original, Line, Column, Scope, Tokens) when ?is_quote(H) -> case elixir_interpolation:extract(Line, Column + 2, Scope, true, T, H) of {NewLine, NewColumn, Parts, Rest} -> case is_unnecessary_quote(Parts, Scope) of true -> elixir_errors:warn(Line, Scope#elixir_tokenizer.file, io_lib:format( "found quoted atom \"~ts\" but the quotes are not required. " "Quotes should only be used to introduce atoms with foreign characters in them", [hd(Parts)] )); false -> ok end, case unescape_tokens(Parts, Scope) of {ok, [Part]} when is_binary(Part) -> case unsafe_to_atom(Part, Line, Column, Scope) of {ok, Atom} -> Token = {atom, {Line, Column, nil}, Atom}, tokenize(Rest, NewLine, NewColumn, Scope, [Token | Tokens]); {error, Reason} -> {error, Reason, Rest, Tokens} end; {ok, Unescaped} -> Key = case Scope#elixir_tokenizer.existing_atoms_only of true -> atom_safe; false -> atom_unsafe end, Token = {Key, {Line, Column, nil}, Unescaped}, tokenize(Rest, NewLine, NewColumn, Scope, [Token | Tokens]); {error, Msg} -> {error, {Line, Column, Msg, [$:, H]}, Rest, Tokens} end; {error, Reason} -> interpolation_error(Reason, Original, Tokens, " (for atom starting at line ~B)", [Line]) end; tokenize([$: | String] = Original, Line, Column, Scope, Tokens) -> case tokenize_identifier(String, Line, Column, Scope) of {_Kind, Atom, Rest, Length, _Ascii, _Special} -> maybe_warn_for_ambiguous_bang_before_equals(atom, Atom, Rest, Scope, Line), Token = {atom, {Line, Column, nil}, Atom}, tokenize(Rest, Line, Column + 1 + Length, Scope, [Token | Tokens]); empty -> unexpected_token(Original, Line, Column, Tokens); {error, Reason} -> {error, Reason, Original, Tokens} end; % Integers and floats tokenize([H | T], Line, Column, Scope, Tokens) when ?is_digit(H) -> case tokenize_number(T, [H], 1, false) of {error, Reason, Number} -> {error, {Line, Column, Reason, Number}, T, Tokens}; {Rest, Number, Original, Length} when is_integer(Number) -> Token = {int, {Line, Column, Number}, Original}, tokenize(Rest, Line, Column + Length, Scope, [Token | Tokens]); {Rest, Number, Original, Length} -> Token = {float, {Line, Column, Number}, Original}, tokenize(Rest, Line, Column + Length, Scope, [Token | Tokens]) end; % Spaces tokenize([T | Rest], Line, Column, Scope, Tokens) when ?is_horizontal_space(T) -> {Remaining, Stripped} = strip_horizontal_space(Rest, 0), handle_space_sensitive_tokens(Remaining, Line, Column + 1 + Stripped, Scope, Tokens); % End of line tokenize(";" ++ Rest, Line, Column, Scope, []) -> tokenize(Rest, Line, Column + 1, Scope, [{';', {Line, Column, 0}}]); tokenize(";" ++ Rest, Line, Column, Scope, [Top | _] = Tokens) when element(1, Top) /= ';' -> tokenize(Rest, Line, Column + 1, Scope, [{';', {Line, Column, 0}} | Tokens]); tokenize("\\" = Original, Line, Column, _Scope, Tokens) -> {error, {Line, Column, "invalid escape \\ at end of file", []}, Original, Tokens}; tokenize("\\\n" = Original, Line, Column, _Scope, Tokens) -> {error, {Line, Column, "invalid escape \\ at end of file", []}, Original, Tokens}; tokenize("\\\r\n" = Original, Line, Column, _Scope, Tokens) -> {error, {Line, Column, "invalid escape \\ at end of file", []}, Original, Tokens}; tokenize("\\\n" ++ Rest, Line, _Column, Scope, Tokens) -> tokenize_eol(Rest, Line, Scope, Tokens); tokenize("\\\r\n" ++ Rest, Line, _Column, Scope, Tokens) -> tokenize_eol(Rest, Line, Scope, Tokens); tokenize("\n" ++ Rest, Line, Column, Scope, Tokens) -> tokenize_eol(Rest, Line, Scope, eol(Line, Column, Tokens)); tokenize("\r\n" ++ Rest, Line, Column, Scope, Tokens) -> tokenize_eol(Rest, Line, Scope, eol(Line, Column, Tokens)); % Others tokenize([$%, $[ | Rest], Line, Column, _Scope, Tokens) -> Reason = {Line, Column, "expected %{ to define a map, got: ", [$%, $[]}, {error, Reason, Rest, Tokens}; tokenize([$%, ${ | T], Line, Column, Scope, Tokens) -> tokenize([${ | T], Line, Column + 1, Scope, [{'%{}', {Line, Column, nil}} | Tokens]); tokenize([$% | T], Line, Column, Scope, Tokens) -> tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, nil}} | Tokens]); tokenize([$. | T], Line, Column, Scope, Tokens) -> DotInfo = {Line, Column, nil}, {Rest, EndLine, EndColumn} = strip_dot_space(T, Line, Column + 1, [{'.', DotInfo}| Tokens], Scope), handle_dot([$. | Rest], EndLine, EndColumn, DotInfo, Scope, Tokens); % Identifiers tokenize(String, Line, Column, Scope, Tokens) -> case tokenize_identifier(String, Line, Column, Scope) of {Kind, Atom, Rest, Length, Ascii, Special} -> HasAt = lists:member($@, Special), case Rest of [$: | T] when ?is_space(hd(T)) -> Token = {kw_identifier, {Line, Column, nil}, Atom}, tokenize(T, Line, Column + Length + 1, Scope, [Token | Tokens]); [$: | T] when hd(T) /= $: -> AtomName = atom_to_list(Atom) ++ [$:], Reason = {Line, Column, "keyword argument must be followed by space after: ", AtomName}, {error, Reason, String, Tokens}; _ when HasAt -> Reason = {Line, Column, invalid_character_error(Kind, $@), atom_to_list(Atom)}, {error, Reason, String, Tokens}; _ when Kind == alias -> tokenize_alias(Rest, Line, Column, Atom, Length, Ascii, Special, Scope, Tokens); _ when Kind == identifier -> maybe_warn_for_ambiguous_bang_before_equals(identifier, Atom, Rest, Scope, Line), tokenize_other(Rest, Line, Column, Atom, Length, Scope, Tokens); _ -> unexpected_token(String, Line, Column, Tokens) end; empty -> unexpected_token(String, Line, Column, Tokens); {error, Reason} -> {error, Reason, String, Tokens} end. unexpected_token([T | Rest], Line, Column, Tokens) -> Message = io_lib:format("\"~ts\" (column ~p, codepoint U+~4.16.0B)", [[T], Column, T]), {error, {Line, Column, "unexpected token: ", Message}, Rest, Tokens}. tokenize_eol(Rest, Line, Scope, Tokens) -> {StrippedRest, Indentation} = strip_horizontal_space(Rest, 0), IndentedScope = Scope#elixir_tokenizer{indentation=Indentation}, tokenize(StrippedRest, Line + 1, Indentation + 1, IndentedScope, Tokens). strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) -> strip_horizontal_space(T, Counter + 1); strip_horizontal_space(T, Counter) -> {T, Counter}. strip_dot_space(T, Line, Column, Tokens, Scope) -> case strip_horizontal_space(T, 0) of {"#" ++ R, _} -> {Rest, Comment} = tokenize_comment(R, [$#]), preserve_comments(Line, Column, Tokens, Comment, Rest, Scope), strip_dot_space(Rest, Line, 1, reset_eol(Tokens), Scope); {"\r\n" ++ Rest, _} -> strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope); {"\n" ++ Rest, _} -> strip_dot_space(Rest, Line + 1, 1, eol(Line, Column, Tokens), Scope); {Rest, Length} -> {Rest, Line, Column + Length} end. handle_char(7) -> {"\\a", "alert"}; handle_char($\b) -> {"\\b", "backspace"}; handle_char($\d) -> {"\\d", "delete"}; handle_char($\e) -> {"\\e", "escape"}; handle_char($\f) -> {"\\f", "form feed"}; handle_char($\n) -> {"\\n", "newline"}; handle_char($\r) -> {"\\r", "carriage return"}; handle_char($\s) -> {"\\s", "space"}; handle_char($\t) -> {"\\t", "tab"}; handle_char($\v) -> {"\\v", "vertical tab"}; handle_char(_) -> false. %% Handlers handle_heredocs(T, Line, Column, H, Scope, Tokens) -> case extract_heredoc_with_interpolation(Line, Column, Scope, true, T, H) of {ok, NewLine, NewColumn, Parts, Rest} -> case unescape_tokens(Parts, Scope) of {ok, Unescaped} -> Token = {heredoc_type(H), {Line, Column, nil}, Unescaped}, tokenize(Rest, NewLine, NewColumn, Scope, [Token | Tokens]); {error, Msg} -> {error, {Line, Column, Msg, [H, H, H]}, Rest, Tokens} end; {error, Reason} -> {error, Reason, [H, H, H] ++ T, Tokens} end. handle_strings(T, Line, Column, H, Scope, Tokens) -> case elixir_interpolation:extract(Line, Column, Scope, true, T, H) of {error, Reason} -> interpolation_error(Reason, [H | T], Tokens, " (for string starting at line ~B)", [Line]); {NewLine, NewColumn, Parts, [$: | Rest]} when ?is_space(hd(Rest)) -> case is_unnecessary_quote(Parts, Scope) of true -> elixir_errors:warn(Line, Scope#elixir_tokenizer.file, io_lib:format( "found quoted keyword \"~ts\" but the quotes are not required. " "Note that keywords are always atoms, even when quoted, and quotes " "should only be used to introduce keywords with foreign characters in them", [hd(Parts)] )); false -> ok end, case unescape_tokens(Parts, Scope) of {ok, Unescaped} -> Key = case Scope#elixir_tokenizer.existing_atoms_only of true -> kw_identifier_safe; false -> kw_identifier_unsafe end, Token = {Key, {Line, Column - 1, nil}, Unescaped}, tokenize(Rest, NewLine, NewColumn + 1, Scope, [Token | Tokens]); {error, Msg} -> {error, {Line, Column, Msg, [H]}, Rest, Tokens} end; {NewLine, NewColumn, Parts, Rest} -> case unescape_tokens(Parts, Scope) of {ok, Unescaped} -> Token = {string_type(H), {Line, Column - 1, nil}, Unescaped}, tokenize(Rest, NewLine, NewColumn, Scope, [Token | Tokens]); {error, Msg} -> {error, {Line, Column, Msg, [H]}, Rest, Tokens} end end. handle_unary_op([$: | Rest], Line, Column, _Kind, Length, Op, Scope, Tokens) when ?is_space(hd(Rest)) -> Token = {kw_identifier, {Line, Column, nil}, Op}, tokenize(Rest, Line, Column + Length + 1, Scope, [Token | Tokens]); handle_unary_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) -> case strip_horizontal_space(Rest, 0) of {[$/ | _] = Remaining, Extra} -> Token = {identifier, {Line, Column, nil}, Op}, tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]); {Remaining, Extra} -> Token = {Kind, {Line, Column, nil}, Op}, tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]) end. handle_op([$: | Rest], Line, Column, _Kind, Length, Op, Scope, Tokens) when ?is_space(hd(Rest)) -> Token = {kw_identifier, {Line, Column, nil}, Op}, tokenize(Rest, Line, Column + Length + 1, Scope, [Token | Tokens]); handle_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) -> case strip_horizontal_space(Rest, 0) of {[$/ | _] = Remaining, Extra} -> Token = {identifier, {Line, Column, nil}, Op}, tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]); {Remaining, Extra} -> Token = {Kind, {Line, Column, previous_was_eol(Tokens)}, Op}, tokenize(Remaining, Line, Column + Length + Extra, Scope, add_token_with_eol(Token, Tokens)) end. % ## Three Token Operators handle_dot([$., T1, T2, T3 | Rest], Line, Column, DotInfo, Scope, Tokens) when ?unary_op3(T1, T2, T3); ?comp_op3(T1, T2, T3); ?and_op3(T1, T2, T3); ?or_op3(T1, T2, T3); ?arrow_op3(T1, T2, T3); ?three_op(T1, T2, T3) -> handle_call_identifier(Rest, Line, Column, DotInfo, 3, list_to_atom([T1, T2, T3]), Scope, Tokens); % ## Two Token Operators handle_dot([$., T1, T2 | Rest], Line, Column, DotInfo, Scope, Tokens) when ?comp_op2(T1, T2); ?rel_op2(T1, T2); ?and_op(T1, T2); ?or_op(T1, T2); ?arrow_op(T1, T2); ?in_match_op(T1, T2); ?two_op(T1, T2); ?list_op(T1, T2); ?type_op(T1, T2) -> handle_call_identifier(Rest, Line, Column, DotInfo, 2, list_to_atom([T1, T2]), Scope, Tokens); % ## Single Token Operators handle_dot([$., T | Rest], Line, Column, DotInfo, Scope, Tokens) when ?at_op(T); ?unary_op(T); ?capture_op(T); ?dual_op(T); ?mult_op(T); ?rel_op(T); ?match_op(T); ?pipe_op(T) -> handle_call_identifier(Rest, Line, Column, DotInfo, 1, list_to_atom([T]), Scope, Tokens); % ## Exception for .( as it needs to be treated specially in the parser handle_dot([$., $( | Rest], Line, Column, DotInfo, Scope, Tokens) -> TokensSoFar = add_token_with_eol({dot_call_op, DotInfo, '.'}, Tokens), tokenize([$( | Rest], Line, Column + 2, Scope, TokensSoFar); handle_dot([$., H | T] = Original, Line, Column, DotInfo, Scope, Tokens) when ?is_quote(H) -> case elixir_interpolation:extract(Line, Column + 1, Scope, true, T, H) of {NewLine, NewColumn, [Part], Rest} when is_list(Part) -> case is_unnecessary_quote([Part], Scope) of true -> elixir_errors:warn(Line, Scope#elixir_tokenizer.file, io_lib:format( "found quoted call \"~ts\" but the quotes are not required. " "Quotes should only be used to perform calls with foreign characters in them", [Part] )); false -> ok end, case unsafe_to_atom(Part, Line, Column, Scope) of {ok, Atom} -> Token = check_call_identifier(Line, Column, Atom, Rest), TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), tokenize(Rest, NewLine, NewColumn, Scope, [Token | TokensSoFar]); {error, Reason} -> {error, Reason, Original, Tokens} end; {_NewLine, _NewColumn, _Parts, Rest} -> {error, {Line, Column, "interpolation is not allowed when invoking functions", [H]}, Rest, Tokens}; {error, Reason} -> interpolation_error(Reason, Original, Tokens, " (for function name starting at line ~B)", [Line]) end; handle_dot([$. | Rest], Line, Column, DotInfo, Scope, Tokens) -> TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), tokenize(Rest, Line, Column, Scope, TokensSoFar). handle_call_identifier(Rest, Line, Column, DotInfo, Length, Op, Scope, Tokens) -> Token = check_call_identifier(Line, Column, Op, Rest), TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens), tokenize(Rest, Line, Column + Length, Scope, [Token | TokensSoFar]). % ## Ambiguous unary/binary operators tokens handle_space_sensitive_tokens([Sign, NotMarker | T], Line, Column, Scope, [{Identifier, _, _} = H | Tokens]) when ?dual_op(Sign), not(?is_space(NotMarker)), NotMarker /= $(, NotMarker /= $[, NotMarker /= $<, NotMarker /= ${, %% containers NotMarker /= $%, NotMarker /= $+, NotMarker /= $-, NotMarker /= $/, NotMarker /= $>, %% operators Identifier == identifier -> Rest = [NotMarker | T], DualOpToken = {dual_op, {Line, Column, nil}, list_to_atom([Sign])}, tokenize(Rest, Line, Column + 1, Scope, [DualOpToken, setelement(1, H, op_identifier) | Tokens]); handle_space_sensitive_tokens(String, Line, Column, Scope, Tokens) -> tokenize(String, Line, Column, Scope, Tokens). %% Helpers eol(_Line, _Column, [{',', {Line, Column, Count}} | Tokens]) -> [{',', {Line, Column, Count + 1}} | Tokens]; eol(_Line, _Column, [{';', {Line, Column, Count}} | Tokens]) -> [{';', {Line, Column, Count + 1}} | Tokens]; eol(_Line, _Column, [{eol, {Line, Column, Count}} | Tokens]) -> [{eol, {Line, Column, Count + 1}} | Tokens]; eol(Line, Column, Tokens) -> [{eol, {Line, Column, 1}} | Tokens]. is_unnecessary_quote([Part], #elixir_tokenizer{warn_on_unnecessary_quotes=true} = Scope) when is_list(Part) -> case (Scope#elixir_tokenizer.identifier_tokenizer):tokenize(Part) of {identifier, _, [], _, _, _} -> true; _ -> false end; is_unnecessary_quote(_Parts, _Scope) -> false. unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{}) when is_binary(Part) andalso byte_size(Part) > 255; is_list(Part) andalso length(Part) > 255 -> {error, {Line, Column, "atom length must be less than system limit: ", elixir_utils:characters_to_list(Part)}}; unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_binary(Binary) -> try {ok, binary_to_existing_atom(Binary, utf8)} catch error:badarg -> {error, {Line, Column, "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}} end; unsafe_to_atom(Binary, _Line, _Column, #elixir_tokenizer{}) when is_binary(Binary) -> {ok, binary_to_atom(Binary, utf8)}; unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) -> try {ok, list_to_existing_atom(List)} catch error:badarg -> {error, {Line, Column, "unsafe atom does not exist: ", List}} end; unsafe_to_atom(List, _Line, _Column, #elixir_tokenizer{}) when is_list(List) -> {ok, list_to_atom(List)}. collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) -> collect_modifiers(T, [H | Buffer]); collect_modifiers(Rest, Buffer) -> {Rest, lists:reverse(Buffer)}. %% Heredocs extract_heredoc_with_interpolation(Line, Column, Scope, Interpol, T, H) -> case extract_heredoc(Line, Column, T, H, Scope) of {ok, NewLine, NewColumn, Body, Rest} -> case elixir_interpolation:extract(Line + 1, 1, Scope, Interpol, Body, 0) of {error, Reason} -> {error, interpolation_format(Reason, " (for heredoc starting at line ~B)", [Line])}; {_, _, Parts, []} -> {ok, NewLine, NewColumn, tokens_to_binary(Parts), Rest} end; {error, _} = Error -> Error end. extract_heredoc(Line0, Column0, Rest0, Marker, Scope) -> case extract_heredoc_header(Rest0) of {ok, Rest1} -> %% We prepend a new line so we can transparently remove %% spaces later. This new line is removed by calling "tl" %% in the final heredoc body three lines below. case extract_heredoc_body(Line0, Column0, Marker, [$\n | Rest1], []) of {ok, Line1, Body, Rest2, Spaces} -> {ok, Line1, 4 + Spaces, tl(remove_heredoc_spaces(Body, Spaces, Marker, Scope)), Rest2}; {error, Reason, ErrorLine, ErrorColumn} -> Terminator = [Marker, Marker, Marker], {Message, Token} = heredoc_error_message(Reason, Line0, Terminator), {error, {ErrorLine, ErrorColumn, Message, Token}} end; error -> Message = "heredoc start must be followed by a new line after ", {error, {Line0, Column0, io_lib:format(Message, []), [Marker, Marker, Marker]}} end. heredoc_error_message(eof, Line, Terminator) -> {io_lib:format("missing terminator: ~ts (for heredoc starting at line ~B)", [Terminator, Line]), []}; heredoc_error_message(badterminator, _Line, Terminator) -> {"invalid location for heredoc terminator, please escape token or move it to its own line: ", Terminator}. %% Remove spaces from heredoc based on the position of the final quotes. remove_heredoc_spaces(Body, Spaces, Marker, Scope) -> case trim_spaces(Body, [0], Spaces, false) of {Acc, false} -> Acc; {Acc, Line} -> Msg = io_lib:format("outdented heredoc line. The contents inside the heredoc should be indented " "at the same level as the closing ~ts. The following is forbidden:~n~n" " def text do~n" " \"\"\"~n" " contents~n" " \"\"\"~n" " end~n~n" "Instead make sure the contents are indented as much as the heredoc closing:~n~n" " def text do~n" " \"\"\"~n" " contents~n" " \"\"\"~n" " end~n~n" "The current heredoc line is indented too little", [[Marker, Marker, Marker]]), elixir_errors:warn(Line, Scope#elixir_tokenizer.file, Msg), Acc end. trim_spaces([{Line, Entry} | Rest], Acc, Spaces, Warned) -> case trim_space(lists:reverse(Entry), Spaces) of {Trimmed, true} when Warned == false -> trim_spaces(Rest, Trimmed ++ Acc, Spaces, Line); {Trimmed, _} -> trim_spaces(Rest, Trimmed ++ Acc, Spaces, Warned) end; trim_spaces([], Acc, _Spaces, Warned) -> {Acc, Warned}. trim_space(Rest, 0) -> {Rest, false}; trim_space([$\n], _) -> {[$\n], false}; trim_space([H | T], Spaces) when ?is_horizontal_space(H) -> trim_space(T, Spaces - 1); trim_space(Rest, _Spaces) -> {Rest, true}. %% Extract the heredoc header. extract_heredoc_header("\r\n" ++ Rest) -> {ok, Rest}; extract_heredoc_header("\n" ++ Rest) -> {ok, Rest}; extract_heredoc_header([H | T]) when ?is_horizontal_space(H) -> extract_heredoc_header(T); extract_heredoc_header(_) -> error. %% Extract heredoc body. It returns the heredoc body (in reverse order), %% the remaining of the document and the number of spaces the heredoc %% is aligned. extract_heredoc_body(Line, Column, Marker, Rest, Buffer) -> case extract_heredoc_line(Marker, Rest, [], 0) of {ok, Entry, NewRest} -> extract_heredoc_body(Line + 1, 1, Marker, NewRest, [{Line, Entry} | Buffer]); {done, Entry, NewRest, Spaces} -> {ok, Line, [{Line, Entry} | Buffer], NewRest, Spaces}; {error, Reason} -> {error, Reason, Line, Column} end. %% Extract a line from the heredoc prepending its contents to a buffer. %% Allow lazy escaping (e.g. \""") extract_heredoc_line(Marker, [$\\, $\\ | T], Buffer) -> extract_heredoc_line(Marker, T, [$\\, $\\ | Buffer]); extract_heredoc_line(Marker, [$\\, Marker | T], Buffer) -> extract_heredoc_line(Marker, T, [Marker, $\\ | Buffer]); extract_heredoc_line(Marker, [Marker, Marker, Marker | _], _) -> {error, badterminator}; extract_heredoc_line(_, "\r\n" ++ Rest, Buffer) -> {ok, [$\n | Buffer], Rest}; extract_heredoc_line(_, "\n" ++ Rest, Buffer) -> {ok, [$\n | Buffer], Rest}; extract_heredoc_line(Marker, [H | T], Buffer) -> extract_heredoc_line(Marker, T, [H | Buffer]); extract_heredoc_line(_, _, _) -> {error, eof}. %% Extract each heredoc line trying to find a match according to the marker. extract_heredoc_line(Marker, [H | T], Buffer, Counter) when ?is_horizontal_space(H) -> extract_heredoc_line(Marker, T, [H | Buffer], Counter + 1); extract_heredoc_line(Marker, [Marker, Marker, Marker | T], Buffer, Counter) -> {done, Buffer, T, Counter}; extract_heredoc_line(Marker, Rest, Buffer, _Counter) -> extract_heredoc_line(Marker, Rest, Buffer). unescape_tokens(Tokens, #elixir_tokenizer{unescape=true}) -> elixir_interpolation:unescape_tokens(Tokens); unescape_tokens(Tokens, #elixir_tokenizer{unescape=false}) -> {ok, tokens_to_binary(Tokens)}. tokens_to_binary(Tokens) -> [if is_list(Token) -> elixir_utils:characters_to_binary(Token); true -> Token end || Token <- Tokens]. %% Integers and floats %% At this point, we are at least sure the first digit is a number. %% Check if we have a point followed by a number; tokenize_number([$., H | T], Acc, Length, false) when ?is_digit(H) -> tokenize_number(T, [H, $. | Acc], Length + 2, true); %% Check if we have an underscore followed by a number; tokenize_number([$_, H | T], Acc, Length, Bool) when ?is_digit(H) -> tokenize_number(T, [H, $_ | Acc], Length + 2, Bool); %% Check if we have e- followed by numbers (valid only for floats); tokenize_number([E, S, H | T], Acc, Length, true) when (E == $E) or (E == $e), ?is_digit(H), S == $+ orelse S == $- -> tokenize_number(T, [H, S, E | Acc], Length + 3, true); %% Check if we have e followed by numbers (valid only for floats); tokenize_number([E, H | T], Acc, Length, true) when (E == $E) or (E == $e), ?is_digit(H) -> tokenize_number(T, [H, E | Acc], Length + 2, true); %% Finally just numbers. tokenize_number([H | T], Acc, Length, Bool) when ?is_digit(H) -> tokenize_number(T, [H | Acc], Length + 1, Bool); %% Cast to float... tokenize_number(Rest, Acc, Length, true) -> try {Number, Original} = reverse_number(Acc, [], []), {Rest, list_to_float(Number), Original, Length} catch error:badarg -> {error, "invalid float number ", lists:reverse(Acc)} end; %% Or integer. tokenize_number(Rest, Acc, Length, false) -> {Number, Original} = reverse_number(Acc, [], []), {Rest, list_to_integer(Number), Original, Length}. tokenize_hex([H | T], Acc, Length) when ?is_hex(H) -> tokenize_hex(T, [H | Acc], Length + 1); tokenize_hex([$_, H | T], Acc, Length) when ?is_hex(H) -> tokenize_hex(T, [H, $_ | Acc], Length + 2); tokenize_hex(Rest, Acc, Length) -> {Number, Original} = reverse_number(Acc, [], []), {Rest, list_to_integer(Number, 16), [$0, $x | Original], Length}. tokenize_octal([H | T], Acc, Length) when ?is_octal(H) -> tokenize_octal(T, [H | Acc], Length + 1); tokenize_octal([$_, H | T], Acc, Length) when ?is_octal(H) -> tokenize_octal(T, [H, $_ | Acc], Length + 2); tokenize_octal(Rest, Acc, Length) -> {Number, Original} = reverse_number(Acc, [], []), {Rest, list_to_integer(Number, 8), [$0, $o | Original], Length}. tokenize_bin([H | T], Acc, Length) when ?is_bin(H) -> tokenize_bin(T, [H | Acc], Length + 1); tokenize_bin([$_, H | T], Acc, Length) when ?is_bin(H) -> tokenize_bin(T, [H, $_ | Acc], Length + 2); tokenize_bin(Rest, Acc, Length) -> {Number, Original} = reverse_number(Acc, [], []), {Rest, list_to_integer(Number, 2), [$0, $b | Original], Length}. reverse_number([$_ | T], Number, Original) -> reverse_number(T, Number, [$_ | Original]); reverse_number([H | T], Number, Original) -> reverse_number(T, [H | Number], [H | Original]); reverse_number([], Number, Original) -> {Number, Original}. %% Comments reset_eol([{eol, {Line, Column, _}} | Rest]) -> [{eol, {Line, Column, 0}} | Rest]; reset_eol(Rest) -> Rest. tokenize_comment("\r\n" ++ _ = Rest, Acc) -> {Rest, lists:reverse(Acc)}; tokenize_comment("\n" ++ _ = Rest, Acc) -> {Rest, lists:reverse(Acc)}; tokenize_comment([H | Rest], Acc) -> tokenize_comment(Rest, [H | Acc]); tokenize_comment([], Acc) -> {[], lists:reverse(Acc)}. preserve_comments(Line, Column, Tokens, Comment, Rest, Scope) -> case Scope#elixir_tokenizer.preserve_comments of Fun when is_function(Fun) -> Fun(Line, Column, Tokens, Comment, Rest); nil -> ok end. %% Identifiers tokenize([H | T]) when ?is_upcase(H) -> {Acc, Rest, Length, Special} = tokenize_continue(T, [H], 1, []), {alias, lists:reverse(Acc), Rest, Length, true, Special}; tokenize([H | T]) when ?is_downcase(H); H == $_ -> {Acc, Rest, Length, Special} = tokenize_continue(T, [H], 1, []), {identifier, lists:reverse(Acc), Rest, Length, true, Special}; tokenize(_List) -> {error, empty}. tokenize_continue([$@ | T], Acc, Length, Special) -> tokenize_continue(T, [$@ | Acc], Length + 1, [$@ | lists:delete($@, Special)]); tokenize_continue([$! | T], Acc, Length, Special) -> {[$! | Acc], T, Length + 1, [$! | Special]}; tokenize_continue([$? | T], Acc, Length, Special) -> {[$? | Acc], T, Length + 1, [$? | Special]}; tokenize_continue([H | T], Acc, Length, Special) when ?is_upcase(H); ?is_downcase(H); ?is_digit(H); H == $_ -> tokenize_continue(T, [H | Acc], Length + 1, Special); tokenize_continue(Rest, Acc, Length, Special) -> {Acc, Rest, Length, Special}. tokenize_identifier(String, Line, Column, Scope) -> case (Scope#elixir_tokenizer.identifier_tokenizer):tokenize(String) of {Kind, Acc, Rest, Length, Ascii, Special} -> case unsafe_to_atom(Acc, Line, Column, Scope) of {ok, Atom} -> {Kind, Atom, Rest, Length, Ascii, Special}; {error, _Reason} = Error -> Error end; {error, {not_nfc, Wrong}} -> Right = unicode:characters_to_nfc_list(Wrong), RightCodepoints = list_to_codepoint_hex(Right), WrongCodepoints = list_to_codepoint_hex(Wrong), Message = io_lib:format("Elixir expects unquoted Unicode atoms and variables to be in NFC form.\n\n" "Got:\n\n \"~ts\" (codepoints~ts)\n\n" "Expected:\n\n \"~ts\" (codepoints~ts)\n\n" "Syntax error before: ", [Wrong, WrongCodepoints, Right, RightCodepoints]), {error, {Line, Column, Message, Wrong}}; {error, empty} -> empty end. list_to_codepoint_hex(List) -> [io_lib:format(" ~4.16.0B", [Codepoint]) || Codepoint <- List]. tokenize_alias(Rest, Line, Column, Atom, Length, Ascii, Special, Scope, Tokens) -> if not Ascii -> AtomName = atom_to_list(Atom), Invalid = hd([C || C <- AtomName, C > 127]), Reason = {Line, Column, invalid_character_error("alias (only ascii characters are allowed)", Invalid), AtomName}, {error, Reason, AtomName ++ Rest, Tokens}; Special /= [] -> AtomName = atom_to_list(Atom), Reason = {Line, Column, invalid_character_error("alias", hd(Special)), AtomName}, {error, Reason, AtomName ++ Rest, Tokens}; true -> AliasesToken = {alias, {Line, Column, nil}, Atom}, tokenize(Rest, Line, Column + Length, Scope, [AliasesToken | Tokens]) end. tokenize_other(Rest, Line, Column, Atom, Length, Scope, Tokens) -> case tokenize_keyword_or_identifier(Rest, Line, Column, Atom, Tokens) of {keyword, NewRest, NewCheck, NewTokens} -> handle_terminator(NewRest, Line, Column + Length, Scope, NewCheck, NewTokens); {identifier, NewRest, NewTokens} -> tokenize(NewRest, Line, Column + Length, Scope, NewTokens); {error, _, _, _} = Error -> Error end. tokenize_keyword_or_identifier(Rest, Line, Column, Atom, Tokens) -> case check_keyword(Line, Column, Atom, Tokens, Rest) of nomatch -> {identifier, Rest, [check_call_identifier(Line, Column, Atom, Rest) | Tokens]}; {ok, [{in_op, _, in} | [{unary_op, NotInfo, 'not'} | T]]} -> {keyword, Rest, {in_op, NotInfo, 'not in'}, T}; {ok, [Check | T]} -> {keyword, Rest, Check, T}; {error, Message, Token} -> {error, {Line, Column, Message, Token}, atom_to_list(Atom) ++ Rest, Tokens} end. %% Check if it is a call identifier (paren | bracket | do) check_call_identifier(Line, Column, Atom, [$( | _]) -> {paren_identifier, {Line, Column, nil}, Atom}; check_call_identifier(Line, Column, Atom, [$[ | _]) -> {bracket_identifier, {Line, Column, nil}, Atom}; check_call_identifier(Line, Column, Atom, _Rest) -> {identifier, {Line, Column, nil}, Atom}. add_token_with_eol({unary_op, _, _} = Left, T) -> [Left | T]; add_token_with_eol(Left, [{eol, _} | T]) -> [Left | T]; add_token_with_eol(Left, T) -> [Left | T]. previous_was_eol([{',', {_, _, Count}} | _]) when Count > 0 -> eol; previous_was_eol([{';', {_, _, Count}} | _]) when Count > 0 -> eol; previous_was_eol([{eol, {_, _, Count}} | _]) when Count > 0 -> eol; previous_was_eol(_) -> nil. %% Error handling interpolation_error(Reason, Rest, Tokens, Extension, Args) -> {error, interpolation_format(Reason, Extension, Args), Rest, Tokens}. interpolation_format({string, Line, Column, Message, Token}, Extension, Args) -> {Line, Column, [Message, io_lib:format(Extension, Args)], Token}; interpolation_format({_, _, _, _} = Reason, _Extension, _Args) -> Reason. %% Terminators handle_terminator(Rest, Line, Column, Scope, Token, Tokens) -> case handle_terminator(Token, Scope) of {error, Reason} -> {error, Reason, atom_to_list(element(1, Token)) ++ Rest, Tokens}; New -> tokenize(Rest, Line, Column, New, [Token | Tokens]) end. handle_terminator(_, #elixir_tokenizer{check_terminators=false} = Scope) -> Scope; handle_terminator(Token, #elixir_tokenizer{terminators=Terminators} = Scope) -> case check_terminator(Token, Terminators, Scope) of {error, _} = Error -> Error; NewScope -> NewScope end. check_terminator({Start, {Line, _, _}}, Terminators, Scope) when Start == '('; Start == '['; Start == '{'; Start == '<<' -> Indentation = Scope#elixir_tokenizer.indentation, Scope#elixir_tokenizer{terminators=[{Start, Line, Indentation} | Terminators]}; check_terminator({Start, {Line, _, _}}, Terminators, Scope) when Start == 'fn'; Start == 'do' -> Indentation = Scope#elixir_tokenizer.indentation, NewScope = case Terminators of %% If the do is indented equally or less than the previous do, it may be a missing end error! [{Start, _, PreviousIndentation} = Previous | _] when Indentation =< PreviousIndentation -> Scope#elixir_tokenizer{mismatch_hints=[Previous | Scope#elixir_tokenizer.mismatch_hints]}; _ -> Scope end, NewScope#elixir_tokenizer{terminators=[{Start, Line, Indentation} | Terminators]}; check_terminator({'end', {EndLine, _, _}}, [{'do', _, Indentation} | Terminators], Scope) -> NewScope = %% If the end is more indented than the do, it may be a missing do error! case Scope#elixir_tokenizer.indentation > Indentation of true -> Hint = {'end', EndLine, Scope#elixir_tokenizer.indentation}, Scope#elixir_tokenizer{mismatch_hints=[Hint | Scope#elixir_tokenizer.mismatch_hints]}; false -> Scope end, NewScope#elixir_tokenizer{terminators=Terminators}; check_terminator({End, _}, [{Start, _, _} | Terminators], Scope) when Start == 'fn', End == 'end'; Start == '(', End == ')'; Start == '[', End == ']'; Start == '{', End == '}'; Start == '<<', End == '>>' -> Scope#elixir_tokenizer{terminators=Terminators}; check_terminator({End, {EndLine, EndColumn, _}}, [{Start, StartLine, _} | _], Scope) when End == 'end'; End == ')'; End == ']'; End == '}'; End == '>>' -> ExpectedEnd = terminator(Start), Suffix = [io_lib:format(". The \"~ts\" at line ~B is missing terminator \"~ts\"", [Start, StartLine, ExpectedEnd]), missing_terminator_hint(Start, ExpectedEnd, Scope)], {error, {EndLine, EndColumn, {"unexpected token: ", Suffix}, [atom_to_list(End)]}}; check_terminator({'end', {Line, Column, _}}, [], #elixir_tokenizer{mismatch_hints=Hints}) -> Suffix = case lists:keyfind('end', 1, Hints) of {'end', HintLine, _Identation} -> io_lib:format("\n\n HINT: it looks like the \"end\" on line ~B " "does not have a matching \"do\" defined before it\n", [HintLine]); false -> "" end, {error, {Line, Column, {"unexpected token: ", Suffix}, "end"}}; check_terminator({End, {Line, Column, _}}, [], _Scope) when End == ')'; End == ']'; End == '}'; End == '>>' -> {error, {Line, Column, "unexpected token: ", atom_to_list(End)}}; check_terminator(_, _, Scope) -> Scope. missing_terminator_hint(Start, End, #elixir_tokenizer{mismatch_hints=Hints}) -> case lists:keyfind(Start, 1, Hints) of {Start, HintLine, _} -> io_lib:format("\n\n HINT: it looks like the \"~ts\" on line ~B does not have a matching \"~ts\"\n", [Start, HintLine, End]); false -> "" end. string_type($") -> bin_string; string_type($') -> list_string. heredoc_type($") -> bin_heredoc; heredoc_type($') -> list_heredoc. sigil_terminator($() -> $); sigil_terminator($[) -> $]; sigil_terminator(${) -> $}; sigil_terminator($<) -> $>; sigil_terminator(O) -> O. terminator('fn') -> 'end'; terminator('do') -> 'end'; terminator('(') -> ')'; terminator('[') -> ']'; terminator('{') -> '}'; terminator('<<') -> '>>'. %% Keywords checking check_keyword(_Line, _Column, _Atom, [{'.', _} | _], _Rest) -> nomatch; check_keyword(DoLine, DoColumn, do, [{identifier, {Line, Column, Meta}, Atom} | T], _Rest) -> {ok, add_token_with_eol({do, {DoLine, DoColumn, nil}}, [{do_identifier, {Line, Column, Meta}, Atom} | T])}; check_keyword(_Line, _Column, do, [{'fn', _} | _], _Rest) -> {error, invalid_do_with_fn_error("unexpected token: "), "do"}; check_keyword(Line, Column, do, Tokens, _Rest) -> case do_keyword_valid(Tokens) of true -> {ok, add_token_with_eol({do, {Line, Column, nil}}, Tokens)}; false -> {error, invalid_do_error("unexpected token: "), "do"} end; check_keyword(_Line, _Column, Atom, _Tokens, _Rest) when Atom == '__aliases__'; Atom == '__block__' -> {error, "reserved token: ", atom_to_list(Atom)}; check_keyword(Line, Column, Atom, Tokens, Rest) -> case keyword(Atom) of false -> nomatch; token -> {ok, [{Atom, {Line, Column, nil}} | Tokens]}; block -> {ok, [{block_identifier, {Line, Column, nil}, Atom} | Tokens]}; Kind -> case strip_horizontal_space(Rest, 0) of {[$/ | _], _} -> {ok, [{identifier, {Line, Column, nil}, Atom} | Tokens]}; _ -> {ok, add_token_with_eol({Kind, {Line, Column, previous_was_eol(Tokens)}, Atom}, Tokens)} end end. %% Fail early on invalid do syntax. For example, after %% most keywords, after comma and so on. do_keyword_valid([{Atom, _} | _]) -> case Atom of ',' -> false; ';' -> false; 'end' -> true; nil -> true; true -> true; false -> true; _ -> keyword(Atom) == false end; do_keyword_valid(_) -> true. % Regular keywords keyword('fn') -> token; keyword('end') -> token; keyword('true') -> token; keyword('false') -> token; keyword('nil') -> token; % Operators keywords keyword('not') -> unary_op; keyword('and') -> and_op; keyword('or') -> or_op; keyword('when') -> when_op; keyword('in') -> in_op; % Block keywords keyword('after') -> block; keyword('else') -> block; keyword('rescue') -> block; keyword('catch') -> block; keyword(_) -> false. invalid_character_error(What, Char) -> io_lib:format("invalid character \"~ts\" (codepoint U+~4.16.0B) in ~ts: ", [[Char], Char, What]). invalid_do_error(Prefix) -> {Prefix, ". In case you wanted to write a \"do\" expression, " "you must either use do-blocks or separate the keyword argument with comma. " "For example, you should either write:\n\n" " if some_condition? do\n" " :this\n" " else\n" " :that\n" " end\n\n" "or the equivalent construct:\n\n" " if(some_condition?, do: :this, else: :that)\n\n" "where \"some_condition?\" is the first argument and the second argument is a keyword list"}. invalid_do_with_fn_error(Prefix) -> {Prefix, ". Anonymous functions are written as:\n\n" " fn pattern -> expression end"}. % TODO: Turn into an error on Elixir 2.0. maybe_warn_too_many_of_same_char([T | _] = Token, [T | _] = _Rest, Line, Scope) -> Warning = case T of $. -> "please use parens around \"...\" instead"; _ -> io_lib:format("please use a space between \"~ts\" and the next \"~ts\"", [Token, [T]]) end, Message = io_lib:format("found \"~ts\" followed by \"~ts\", ~ts", [Token, [T], Warning]), elixir_errors:warn(Line, Scope#elixir_tokenizer.file, Message); maybe_warn_too_many_of_same_char(_Token, _Rest, _Line, _Scope) -> ok. %% TODO: Turn into an error on Elixir v2.0 maybe_warn_for_ambiguous_bang_before_equals(Kind, Atom, [$= | _], Scope, Line) -> {What, Identifier} = case Kind of atom -> {"atom", [$: | atom_to_list(Atom)]}; identifier -> {"identifier", atom_to_list(Atom)} end, case lists:last(Identifier) of Last when Last == $!; Last == $? -> Msg = io_lib:format("found ~ts \"~ts\", ending with \"~ts\", followed by =. " "It is unclear if you mean \"~ts ~ts=\" or \"~ts =\". Please add " "a space before or after ~ts to remove the ambiguity", [What, Identifier, [Last], lists:droplast(Identifier), [Last], Identifier, [Last]]), elixir_errors:warn(Line, Scope#elixir_tokenizer.file, Msg); _ -> ok end; maybe_warn_for_ambiguous_bang_before_equals(_Kind, _Atom, _Rest, _Scope, _Line) -> ok.