-module(elixir_tokenizer).
-include("elixir.hrl").
-include("elixir_tokenizer.hrl").
-export([tokenize/1, tokenize/3, tokenize/4, invalid_do_error/1]).

-define(at_op(T),
  T =:= $@).

-define(capture_op(T),
  T =:= $&).

-define(unary_op(T),
  T =:= $!;
  T =:= $^).

-define(range_op(T1, T2),
  T1 =:= $., T2 =:= $.).

-define(concat_op(T1, T2),
  T1 =:= $+, T2 =:= $+;
  T1 =:= $-, T2 =:= $-;
  T1 =:= $<, T2 =:= $>).

-define(concat_op3(T1, T2, T3),
  T1 =:= $+, T2 =:= $+, T3 =:= $+;
  T1 =:= $-, T2 =:= $-, T3 =:= $-).

-define(power_op(T1, T2),
  T1 =:= $*, T2 =:= $*).

-define(mult_op(T),
  T =:= $* orelse T =:= $/).

-define(dual_op(T),
  T =:= $+ orelse T =:= $-).

-define(arrow_op3(T1, T2, T3),
  T1 =:= $<, T2 =:= $<, T3 =:= $<;
  T1 =:= $>, T2 =:= $>, T3 =:= $>;
  T1 =:= $~, T2 =:= $>, T3 =:= $>;
  T1 =:= $<, T2 =:= $<, T3 =:= $~;
  T1 =:= $<, T2 =:= $~, T3 =:= $>;
  T1 =:= $<, T2 =:= $|, T3 =:= $>).

-define(arrow_op(T1, T2),
  T1 =:= $|, T2 =:= $>;
  T1 =:= $~, T2 =:= $>;
  T1 =:= $<, T2 =:= $~).

-define(rel_op(T),
  T =:= $<;
  T =:= $>).

-define(rel_op2(T1, T2),
  T1 =:= $<, T2 =:= $=;
  T1 =:= $>, T2 =:= $=).

-define(comp_op2(T1, T2),
  T1 =:= $=, T2 =:= $=;
  T1 =:= $=, T2 =:= $~;
  T1 =:= $!, T2 =:= $=).

-define(comp_op3(T1, T2, T3),
  T1 =:= $=, T2 =:= $=, T3 =:= $=;
  T1 =:= $!, T2 =:= $=, T3 =:= $=).

-define(ternary_op(T1, T2),
  T1 =:= $/, T2 =:= $/).

-define(and_op(T1, T2),
  T1 =:= $&, T2 =:= $&).

-define(or_op(T1, T2),
  T1 =:= $|, T2 =:= $|).

-define(and_op3(T1, T2, T3),
  T1 =:= $&, T2 =:= $&, T3 =:= $&).

-define(or_op3(T1, T2, T3),
  T1 =:= $|, T2 =:= $|, T3 =:= $|).

-define(match_op(T),
  T =:= $=).

-define(in_match_op(T1, T2),
  T1 =:= $<, T2 =:= $-;
  T1 =:= $\\, T2 =:= $\\).

-define(stab_op(T1, T2),
  T1 =:= $-, T2 =:= $>).

-define(type_op(T1, T2),
  T1 =:= $:, T2 =:= $:).

-define(pipe_op(T),
  T =:= $|).

%% Deprecated operators

-define(unary_op3(T1, T2, T3),
  T1 =:= $~, T2 =:= $~, T3 =:= $~).

-define(xor_op3(T1, T2, T3),
  T1 =:= $^, T2 =:= $^, T3 =:= $^).

tokenize(String, Line, Column, #elixir_tokenizer{} = Scope) ->
  tokenize(String, Line, Column, Scope, []);

tokenize(String, Line, Column, Opts) ->
  IdentifierTokenizer = elixir_config:identifier_tokenizer(),

  Scope =
    lists:foldl(fun
      ({check_terminators, false}, Acc) ->
        Acc#elixir_tokenizer{terminators=none};
      ({cursor_completion, true}, Acc) ->
        Acc#elixir_tokenizer{cursor_completion=prune_and_cursor};
      ({existing_atoms_only, ExistingAtomsOnly}, Acc) when is_boolean(ExistingAtomsOnly) ->
        Acc#elixir_tokenizer{existing_atoms_only=ExistingAtomsOnly};
      ({static_atoms_encoder, StaticAtomsEncoder}, Acc) when is_function(StaticAtomsEncoder) ->
        Acc#elixir_tokenizer{static_atoms_encoder=StaticAtomsEncoder};
      ({preserve_comments, PreserveComments}, Acc) when is_function(PreserveComments) ->
        Acc#elixir_tokenizer{preserve_comments=PreserveComments};
      ({unescape, Unescape}, Acc) when is_boolean(Unescape) ->
        Acc#elixir_tokenizer{unescape=Unescape};
      ({warn_on_unnecessary_quotes, Unnecessary}, Acc) when is_boolean(Unnecessary) ->
        Acc#elixir_tokenizer{warn_on_unnecessary_quotes=Unnecessary};
      (_, Acc) ->
        Acc
    end, #elixir_tokenizer{identifier_tokenizer=IdentifierTokenizer}, Opts),

  tokenize(String, Line, Column, Scope, []).

tokenize(String, Line, Opts) ->
  tokenize(String, Line, 1, Opts).

tokenize([], Line, Column, #elixir_tokenizer{cursor_completion=Cursor} = Scope, Tokens) when Cursor /= false ->
  #elixir_tokenizer{ascii_identifiers_only=Ascii, terminators=Terminators, warnings=Warnings} = Scope,

  {CursorColumn, CursorTerminators, CursorTokens} =
    add_cursor(Line, Column, Cursor, Terminators, Tokens),

  AllWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, Warnings),
  AccTokens = cursor_complete(Line, CursorColumn, CursorTerminators, CursorTokens),
  {ok, Line, Column, AllWarnings, AccTokens};

tokenize([], EndLine, Column, #elixir_tokenizer{terminators=[{Start, StartLine, _} | _]} = Scope, Tokens) ->
  End = terminator(Start),
  Hint = missing_terminator_hint(Start, End, Scope),
  Message = "missing terminator: ~ts (for \"~ts\" starting at line ~B)",
  Formatted = io_lib:format(Message, [End, Start, StartLine]),
  error({EndLine, Column, [Formatted, Hint], []}, [], Scope, Tokens);

tokenize([], Line, Column, #elixir_tokenizer{} = Scope, Tokens) ->
  #elixir_tokenizer{ascii_identifiers_only=Ascii, warnings=Warnings} = Scope,
  AllWarnings = maybe_unicode_lint_warnings(Ascii, Tokens, Warnings),
  {ok, Line, Column, AllWarnings, lists:reverse(Tokens)};

% VC merge conflict

tokenize(("<<<<<<<" ++ _) = Original, Line, 1, Scope, Tokens) ->
  FirstLine = lists:takewhile(fun(C) -> C =/= $\n andalso C =/= $\r end, Original),
  Reason = {Line, 1, "found an unexpected version control marker, please resolve the conflicts: ", FirstLine},
  error(Reason, Original, Scope, Tokens);

% Base integers

tokenize([$0, $x, H | T], Line, Column, Scope, Tokens) when ?is_hex(H) ->
  {Rest, Number, OriginalRepresentation, Length} = tokenize_hex(T, [H], 1),
  Token = {int, {Line, Column, Number}, OriginalRepresentation},
  tokenize(Rest, Line, Column + 2 + Length, Scope, [Token | Tokens]);

tokenize([$0, $b, H | T], Line, Column, Scope, Tokens) when ?is_bin(H) ->
  {Rest, Number, OriginalRepresentation, Length} = tokenize_bin(T, [H], 1),
  Token = {int, {Line, Column, Number}, OriginalRepresentation},
  tokenize(Rest, Line, Column + 2 + Length, Scope, [Token | Tokens]);

tokenize([$0, $o, H | T], Line, Column, Scope, Tokens) when ?is_octal(H) ->
  {Rest, Number, OriginalRepresentation, Length} = tokenize_octal(T, [H], 1),
  Token = {int, {Line, Column, Number}, OriginalRepresentation},
  tokenize(Rest, Line, Column + 2 + Length, Scope, [Token | Tokens]);

% Comments

tokenize([$# | String], Line, Column, Scope, Tokens) ->
  case tokenize_comment(String, [$#]) of
    {error, Char} ->
      error_comment(Char, [$# | String], Line, Column, Scope, Tokens);
    {Rest, Comment} ->
      preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
      tokenize(Rest, Line, Column, Scope, reset_eol(Tokens))
  end;

% Sigils

tokenize([$~, H | _T] = Original, Line, Column, Scope, Tokens) when ?is_upcase(H) orelse ?is_downcase(H) ->
  tokenize_sigil(Original, Line, Column, Scope, Tokens);

% Char tokens

% We tokenize char literals (?a) as {char, _, CharInt} instead of {number, _,
% CharInt}. This is exactly what Erlang does with Erlang char literals
% ($a). This means we'll have to adjust the error message for char literals in
% elixir_errors.erl as by default {char, _, _} tokens are "hijacked" by Erlang
% and printed with Erlang syntax ($a) in the parser's error messages.

tokenize([$?, $\\, H | T], Line, Column, Scope, Tokens) ->
  Char = elixir_interpolation:unescape_map(H),

  NewScope = if
    H =:= Char, H =/= $\\ ->
      case handle_char(Char) of
        {Escape, Name} ->
          Msg = io_lib:format("found ?\\ followed by code point 0x~.16B (~ts), please use ?~ts instead",
                              [Char, Name, Escape]),
          prepend_warning(Line, Column, Msg, Scope);

        false when ?is_downcase(H); ?is_upcase(H) ->
          Msg = io_lib:format("unknown escape sequence ?\\~tc, use ?~tc instead", [H, H]),
          prepend_warning(Line, Column, Msg, Scope);

        false ->
          Scope
      end;
    true ->
      Scope
  end,

  Token = {char, {Line, Column, [$?, $\\, H]}, Char},
  tokenize(T, Line, Column + 3, NewScope, [Token | Tokens]);

tokenize([$?, Char | T], Line, Column, Scope, Tokens) ->
  NewScope = case handle_char(Char) of
    {Escape, Name} ->
      Msg = io_lib:format("found ? followed by code point 0x~.16B (~ts), please use ?~ts instead",
                          [Char, Name, Escape]),
      prepend_warning(Line, Column, Msg, Scope);
    false ->
      Scope
  end,
  Token = {char, {Line, Column, [$?, Char]}, Char},
  tokenize(T, Line, Column + 2, NewScope, [Token | Tokens]);

% Heredocs

tokenize("\"\"\"" ++ T, Line, Column, Scope, Tokens) ->
  handle_heredocs(T, Line, Column, $", Scope, Tokens);

%% TODO: Deprecate single-quoted in Elixir v1.17
tokenize("'''" ++ T, Line, Column, Scope, Tokens) ->
  handle_heredocs(T, Line, Column, $', Scope, Tokens);

% Strings

tokenize([$" | T], Line, Column, Scope, Tokens) ->
  handle_strings(T, Line, Column + 1, $", Scope, Tokens);

%% TODO: Deprecate single-quoted in Elixir v1.17
tokenize([$' | T], Line, Column, Scope, Tokens) ->
  handle_strings(T, Line, Column + 1, $', Scope, Tokens);

% Operator atoms

tokenize(".:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 2, Scope, [{kw_identifier, {Line, Column, nil}, '.'} | Tokens]);

tokenize("...:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 4, Scope, [{kw_identifier, {Line, Column, nil}, '...'} | Tokens]);
tokenize("<<>>:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 5, Scope, [{kw_identifier, {Line, Column, nil}, '<<>>'} | Tokens]);
tokenize("%{}:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 4, Scope, [{kw_identifier, {Line, Column, nil}, '%{}'} | Tokens]);
tokenize("%:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 2, Scope, [{kw_identifier, {Line, Column, nil}, '%'} | Tokens]);
tokenize("&:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 2, Scope, [{kw_identifier, {Line, Column, nil}, '&'} | Tokens]);
tokenize("{}:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 3, Scope, [{kw_identifier, {Line, Column, nil}, '{}'} | Tokens]);
tokenize("..//:" ++ Rest, Line, Column, Scope, Tokens) when ?is_space(hd(Rest)) ->
  tokenize(Rest, Line, Column + 5, Scope, [{kw_identifier, {Line, Column, nil}, '..//'} | Tokens]);

tokenize(":..." ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize(Rest, Line, Column + 4, Scope, [{atom, {Line, Column, nil}, '...'} | Tokens]);
tokenize(":<<>>" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize(Rest, Line, Column + 5, Scope, [{atom, {Line, Column, nil}, '<<>>'} | Tokens]);
tokenize(":%{}" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize(Rest, Line, Column + 4, Scope, [{atom, {Line, Column, nil}, '%{}'} | Tokens]);
tokenize(":%" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize(Rest, Line, Column + 2, Scope, [{atom, {Line, Column, nil}, '%'} | Tokens]);
tokenize(":{}" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize(Rest, Line, Column + 3, Scope, [{atom, {Line, Column, nil}, '{}'} | Tokens]);
tokenize(":..//" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize(Rest, Line, Column + 5, Scope, [{atom, {Line, Column, nil}, '..//'} | Tokens]);

% ## Three Token Operators
tokenize([$:, T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when
    ?unary_op3(T1, T2, T3); ?comp_op3(T1, T2, T3); ?and_op3(T1, T2, T3); ?or_op3(T1, T2, T3);
    ?arrow_op3(T1, T2, T3); ?xor_op3(T1, T2, T3); ?concat_op3(T1, T2, T3) ->
  Token = {atom, {Line, Column, nil}, list_to_atom([T1, T2, T3])},
  tokenize(Rest, Line, Column + 4, Scope, [Token | Tokens]);

% ## Two Token Operators

tokenize([$:, $:, $: | Rest], Line, Column, Scope, Tokens) ->
  Message = "atom ::: must be written between quotes, as in :\"::\", to avoid ambiguity",
  NewScope = prepend_warning(Line, Column, Message, Scope),
  Token = {atom, {Line, Column, nil}, '::'},
  tokenize(Rest, Line, Column + 3, NewScope, [Token | Tokens]);

tokenize([$:, T1, T2 | Rest], Line, Column, Scope, Tokens) when
    ?comp_op2(T1, T2); ?rel_op2(T1, T2); ?and_op(T1, T2); ?or_op(T1, T2);
    ?arrow_op(T1, T2); ?in_match_op(T1, T2); ?concat_op(T1, T2); ?power_op(T1, T2);
    ?stab_op(T1, T2); ?range_op(T1, T2) ->
  Token = {atom, {Line, Column, nil}, list_to_atom([T1, T2])},
  tokenize(Rest, Line, Column + 3, Scope, [Token | Tokens]);

% ## Single Token Operators
tokenize([$:, T | Rest], Line, Column, Scope, Tokens) when
    ?at_op(T); ?unary_op(T); ?capture_op(T); ?dual_op(T); ?mult_op(T);
    ?rel_op(T); ?match_op(T); ?pipe_op(T); T =:= $. ->
  Token = {atom, {Line, Column, nil}, list_to_atom([T])},
  tokenize(Rest, Line, Column + 2, Scope, [Token | Tokens]);

% ## Stand-alone tokens

%% TODO: Consider either making ... as nullary operator (same as ..)
%% or deprecating it. In Elixir itself it is only used in typespecs.
tokenize("..." ++ Rest, Line, Column, Scope, Tokens) ->
  NewScope = maybe_warn_too_many_of_same_char("...", Rest, Line, Column, Scope),
  Token = check_call_identifier(Line, Column, "...", '...', Rest),
  tokenize(Rest, Line, Column + 3, NewScope, [Token | Tokens]);

tokenize("=>" ++ Rest, Line, Column, Scope, Tokens) ->
  Token = {assoc_op, {Line, Column, previous_was_eol(Tokens)}, '=>'},
  tokenize(Rest, Line, Column + 2, Scope, add_token_with_eol(Token, Tokens));

tokenize("..//" ++ Rest = String, Line, Column, Scope, Tokens) ->
  case strip_horizontal_space(Rest, 0) of
    {[$/ | _] = Remaining, Extra} ->
      Token = {identifier, {Line, Column, nil}, '..//'},
      tokenize(Remaining, Line, Column + 4 + Extra, Scope, [Token | Tokens]);
    {_, _} ->
      unexpected_token(String, Line, Column, Scope, Tokens)
  end;

% ## Ternary operator

% ## Three token operators
tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?unary_op3(T1, T2, T3) ->
  handle_unary_op(Rest, Line, Column, unary_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens);

tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?comp_op3(T1, T2, T3) ->
  handle_op(Rest, Line, Column, comp_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens);

tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?and_op3(T1, T2, T3) ->
  NewScope = maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Column, Scope),
  handle_op(Rest, Line, Column, and_op, 3, list_to_atom([T1, T2, T3]), NewScope, Tokens);

tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?or_op3(T1, T2, T3) ->
  NewScope = maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Column, Scope),
  handle_op(Rest, Line, Column, or_op, 3, list_to_atom([T1, T2, T3]), NewScope, Tokens);

tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?xor_op3(T1, T2, T3) ->
  NewScope = maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Column, Scope),
  handle_op(Rest, Line, Column, xor_op, 3, list_to_atom([T1, T2, T3]), NewScope, Tokens);

tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?concat_op3(T1, T2, T3) ->
  NewScope = maybe_warn_too_many_of_same_char([T1, T2, T3], Rest, Line, Column, Scope),
  handle_op(Rest, Line, Column, concat_op, 3, list_to_atom([T1, T2, T3]), NewScope, Tokens);

tokenize([T1, T2, T3 | Rest], Line, Column, Scope, Tokens) when ?arrow_op3(T1, T2, T3) ->
  handle_op(Rest, Line, Column, arrow_op, 3, list_to_atom([T1, T2, T3]), Scope, Tokens);

% ## Containers + punctuation tokens
tokenize([$, | Rest], Line, Column, Scope, Tokens) ->
  Token = {',', {Line, Column, 0}},
  tokenize(Rest, Line, Column + 1, Scope, [Token | Tokens]);

tokenize([$<, $< | Rest], Line, Column, Scope, Tokens) ->
  Token = {'<<', {Line, Column, nil}},
  handle_terminator(Rest, Line, Column + 2, Scope, Token, Tokens);

tokenize([$>, $> | Rest], Line, Column, Scope, Tokens) ->
  Token = {'>>', {Line, Column, previous_was_eol(Tokens)}},
  handle_terminator(Rest, Line, Column + 2, Scope, Token, Tokens);

tokenize([${ | Rest], Line, Column, Scope, [{'%', _} | _] = Tokens) ->
  Message =
    "unexpected space between % and {\n\n"
    "If you want to define a map, write %{...}, with no spaces.\n"
    "If you want to define a struct, write %StructName{...}.\n\n"
    "Syntax error before: ",
  error({Line, Column, Message, [${]}, Rest, Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when T =:= $(; T =:= ${; T =:= $[ ->
  Token = {list_to_atom([T]), {Line, Column, nil}},
  handle_terminator(Rest, Line, Column + 1, Scope, Token, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when T =:= $); T =:= $}; T =:= $] ->
  Token = {list_to_atom([T]), {Line, Column, previous_was_eol(Tokens)}},
  handle_terminator(Rest, Line, Column + 1, Scope, Token, Tokens);

% ## Two Token Operators
tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?ternary_op(T1, T2) ->
  Op = list_to_atom([T1, T2]),
  Token = {ternary_op, {Line, Column, previous_was_eol(Tokens)}, Op},
  tokenize(Rest, Line, Column + 2, Scope, add_token_with_eol(Token, Tokens));

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?power_op(T1, T2) ->
  handle_op(Rest, Line, Column, power_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?range_op(T1, T2) ->
  handle_op(Rest, Line, Column, range_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?concat_op(T1, T2) ->
  handle_op(Rest, Line, Column, concat_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?arrow_op(T1, T2) ->
  handle_op(Rest, Line, Column, arrow_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?comp_op2(T1, T2) ->
  handle_op(Rest, Line, Column, comp_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?rel_op2(T1, T2) ->
  handle_op(Rest, Line, Column, rel_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?and_op(T1, T2) ->
  handle_op(Rest, Line, Column, and_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?or_op(T1, T2) ->
  handle_op(Rest, Line, Column, or_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?in_match_op(T1, T2) ->
  handle_op(Rest, Line, Column, in_match_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?type_op(T1, T2) ->
  handle_op(Rest, Line, Column, type_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

tokenize([T1, T2 | Rest], Line, Column, Scope, Tokens) when ?stab_op(T1, T2) ->
  handle_op(Rest, Line, Column, stab_op, 2, list_to_atom([T1, T2]), Scope, Tokens);

% ## Single Token Operators

tokenize([$& | Rest], Line, Column, Scope, Tokens) ->
  Kind =
    case strip_horizontal_space(Rest, 0) of
      {[Int | _], 0} when ?is_digit(Int) ->
        capture_int;

      {[$/ | NewRest], _} ->
        case strip_horizontal_space(NewRest, 0) of
          {[$/ | _], _} -> capture_op;
          {_, _} -> identifier
        end;

      {_, _} ->
        capture_op
    end,

  Token = {Kind, {Line, Column, nil}, '&'},
  tokenize(Rest, Line, Column + 1, Scope, [Token | Tokens]);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?at_op(T) ->
  handle_unary_op(Rest, Line, Column, at_op, 1, list_to_atom([T]), Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?unary_op(T) ->
  handle_unary_op(Rest, Line, Column, unary_op, 1, list_to_atom([T]), Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?rel_op(T) ->
  handle_op(Rest, Line, Column, rel_op, 1, list_to_atom([T]), Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?dual_op(T) ->
  handle_unary_op(Rest, Line, Column, dual_op, 1, list_to_atom([T]), Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?mult_op(T) ->
  handle_op(Rest, Line, Column, mult_op, 1, list_to_atom([T]), Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?match_op(T) ->
  handle_op(Rest, Line, Column, match_op, 1, list_to_atom([T]), Scope, Tokens);

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?pipe_op(T) ->
  handle_op(Rest, Line, Column, pipe_op, 1, list_to_atom([T]), Scope, Tokens);

% Non-operator Atoms

tokenize([$:, H | T] = Original, Line, Column, Scope, Tokens) when ?is_quote(H) ->
  case elixir_interpolation:extract(Line, Column + 2, Scope, true, T, H) of
    {NewLine, NewColumn, Parts, Rest, InterScope} ->
      NewScope = case is_unnecessary_quote(Parts, InterScope) of
        true ->
          WarnMsg = io_lib:format(
            "found quoted atom \"~ts\" but the quotes are not required. "
            "Atoms made exclusively of ASCII letters, numbers, underscores, "
            "beginning with a letter or underscore, and optionally ending with ! or ? "
            "do not require quotes",
            [hd(Parts)]
          ),
          prepend_warning(Line, Column, WarnMsg, InterScope);

        false ->
          InterScope
      end,

      case unescape_tokens(Parts, Line, Column, NewScope) of
        {ok, [Part]} when is_binary(Part) ->
          case unsafe_to_atom(Part, Line, Column, Scope) of
            {ok, Atom} ->
              Token = {atom_quoted, {Line, Column, nil}, Atom},
              tokenize(Rest, NewLine, NewColumn, NewScope, [Token | Tokens]);

            {error, Reason} ->
              error(Reason, Rest, NewScope, Tokens)
          end;

        {ok, Unescaped} ->
          Key = case Scope#elixir_tokenizer.existing_atoms_only of
            true  -> atom_safe;
            false -> atom_unsafe
          end,
          Token = {Key, {Line, Column, nil}, Unescaped},
          tokenize(Rest, NewLine, NewColumn, NewScope, [Token | Tokens]);

        {error, Reason} ->
          error(Reason, Rest, NewScope, Tokens)
      end;

    {error, Reason} ->
      Message = " (for atom starting at line ~B)",
      interpolation_error(Reason, Original, Scope, Tokens, Message, [Line])
  end;

tokenize([$: | String] = Original, Line, Column, Scope, Tokens) ->
  case tokenize_identifier(String, Line, Column, Scope, false) of
    {_Kind, Unencoded, Atom, Rest, Length, Ascii, _Special} ->
      NewScope = maybe_warn_for_ambiguous_bang_before_equals(atom, Unencoded, Rest, Line, Column, Scope),
      TrackedScope = track_ascii(Ascii, NewScope),
      Token = {atom, {Line, Column, Unencoded}, Atom},
      tokenize(Rest, Line, Column + 1 + Length, TrackedScope, [Token | Tokens]);
    empty when Scope#elixir_tokenizer.cursor_completion == false ->
      unexpected_token(Original, Line, Column, Scope, Tokens);
    empty ->
      tokenize([], Line, Column, Scope, Tokens);
    {unexpected_token, Length} ->
      unexpected_token(lists:nthtail(Length - 1, String), Line, Column + Length - 1, Scope, Tokens);
    {error, Reason} ->
      error(Reason, Original, Scope, Tokens)
  end;

% Integers and floats
% We use int and flt otherwise elixir_parser won't format them
% properly in case of errors.

tokenize([H | T], Line, Column, Scope, Tokens) when ?is_digit(H) ->
  case tokenize_number(T, [H], 1, false) of
    {error, Reason, Original} ->
      error({Line, Column, Reason, Original}, T, Scope, Tokens);
    {[I | Rest], Number, Original, _Length} when ?is_upcase(I); ?is_downcase(I); I == $_ ->
      if
        Number == 0, (I =:= $x) orelse (I =:= $o) orelse (I =:= $b), Rest == [],
        Scope#elixir_tokenizer.cursor_completion /= false ->
          tokenize([], Line, Column, Scope, Tokens);

        true ->
          Msg =
            io_lib:format(
              "invalid character \"~ts\" after number ~ts. If you intended to write a number, "
              "make sure to separate the number from the character (using comma, space, etc). "
              "If you meant to write a function name or a variable, note that identifiers in "
              "Elixir cannot start with numbers. Unexpected token: ",
              [[I], Original]
            ),

          error({Line, Column, Msg, [I]}, T, Scope, Tokens)
      end;
    {Rest, Number, Original, Length} when is_integer(Number) ->
      Token = {int, {Line, Column, Number}, Original},
      tokenize(Rest, Line, Column + Length, Scope, [Token | Tokens]);
    {Rest, Number, Original, Length} ->
      Token = {flt, {Line, Column, Number}, Original},
      tokenize(Rest, Line, Column + Length, Scope, [Token | Tokens])
  end;

% Spaces

tokenize([T | Rest], Line, Column, Scope, Tokens) when ?is_horizontal_space(T) ->
  {Remaining, Stripped} = strip_horizontal_space(Rest, 0),
  handle_space_sensitive_tokens(Remaining, Line, Column + 1 + Stripped, Scope, Tokens);

% End of line

tokenize(";" ++ Rest, Line, Column, Scope, []) ->
  tokenize(Rest, Line, Column + 1, Scope, [{';', {Line, Column, 0}}]);

tokenize(";" ++ Rest, Line, Column, Scope, [Top | _] = Tokens) when element(1, Top) /= ';' ->
  tokenize(Rest, Line, Column + 1, Scope, [{';', {Line, Column, 0}} | Tokens]);

tokenize("\\" = Original, Line, Column, Scope, Tokens) ->
  error({Line, Column, "invalid escape \\ at end of file", []}, Original, Scope, Tokens);

tokenize("\\\n" = Original, Line, Column, Scope, Tokens) ->
  error({Line, Column, "invalid escape \\ at end of file", []}, Original, Scope, Tokens);

tokenize("\\\r\n" = Original, Line, Column, Scope, Tokens) ->
  error({Line, Column, "invalid escape \\ at end of file", []}, Original, Scope, Tokens);

tokenize("\\\n" ++ Rest, Line, _Column, Scope, Tokens) ->
  tokenize_eol(Rest, Line, Scope, Tokens);

tokenize("\\\r\n" ++ Rest, Line, _Column, Scope, Tokens) ->
  tokenize_eol(Rest, Line, Scope, Tokens);

tokenize("\n" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize_eol(Rest, Line, Scope, eol(Line, Column, Tokens));

tokenize("\r\n" ++ Rest, Line, Column, Scope, Tokens) ->
  tokenize_eol(Rest, Line, Scope, eol(Line, Column, Tokens));

% Others

tokenize([$%, $( | Rest], Line, Column, Scope, Tokens) ->
  Reason = {Line, Column, "expected %{ to define a map, got: ", [$%, $(]},
  error(Reason, Rest, Scope, Tokens);

tokenize([$%, $[ | Rest], Line, Column, Scope, Tokens) ->
  Reason = {Line, Column, "expected %{ to define a map, got: ", [$%, $[]},
  error(Reason, Rest, Scope, Tokens);

tokenize([$%, ${ | T], Line, Column, Scope, Tokens) ->
  tokenize([${ | T], Line, Column + 1, Scope, [{'%{}', {Line, Column, nil}} | Tokens]);

tokenize([$% | T], Line, Column, Scope, Tokens) ->
  tokenize(T, Line, Column + 1, Scope, [{'%', {Line, Column, nil}} | Tokens]);

tokenize([$. | T], Line, Column, Scope, Tokens) ->
  tokenize_dot(T, Line, Column + 1, {Line, Column, nil}, Scope, Tokens);

% Identifiers

tokenize(String, Line, Column, OriginalScope, Tokens) ->
  case tokenize_identifier(String, Line, Column, OriginalScope, not previous_was_dot(Tokens)) of
    {Kind, Unencoded, Atom, Rest, Length, Ascii, Special} ->
      HasAt = lists:member(at, Special),
      Scope = track_ascii(Ascii, OriginalScope),

      case Rest of
        [$: | T] when ?is_space(hd(T)) ->
          Token = {kw_identifier, {Line, Column, Unencoded}, Atom},
          tokenize(T, Line, Column + Length + 1, Scope, [Token | Tokens]);

        [$: | T] when hd(T) =/= $: ->
          AtomName = atom_to_list(Atom) ++ [$:],
          Reason = {Line, Column, "keyword argument must be followed by space after: ", AtomName},
          error(Reason, String, Scope, Tokens);

        _ when HasAt ->
          Reason = {Line, Column, invalid_character_error(Kind, $@), atom_to_list(Atom)},
          error(Reason, String, Scope, Tokens);

        _ when Atom == '__aliases__'; Atom == '__block__' ->
          error({Line, Column, "reserved token: ", atom_to_list(Atom)}, Rest, Scope, Tokens);

        _ when Kind == alias ->
          tokenize_alias(Rest, Line, Column, Unencoded, Atom, Length, Ascii, Special, Scope, Tokens);

        _ when Kind == identifier ->
          NewScope = maybe_warn_for_ambiguous_bang_before_equals(identifier, Unencoded, Rest, Line, Column, Scope),
          Token = check_call_identifier(Line, Column, Unencoded, Atom, Rest),
          tokenize(Rest, Line, Column + Length, NewScope, [Token | Tokens]);

        _ ->
          unexpected_token(String, Line, Column, Scope, Tokens)
      end;

    {keyword, Atom, Type, Rest, Length} ->
      tokenize_keyword(Type, Rest, Line, Column, Atom, Length, OriginalScope, Tokens);

    empty when OriginalScope#elixir_tokenizer.cursor_completion == false ->
      unexpected_token(String, Line, Column, OriginalScope, Tokens);

    empty  ->
      case String of
        [$~, L] when ?is_upcase(L); ?is_downcase(L) -> tokenize([], Line, Column, OriginalScope, Tokens);
        [$~] -> tokenize([], Line, Column, OriginalScope, Tokens);
        _ -> unexpected_token(String, Line, Column, OriginalScope, Tokens)
      end;

    {unexpected_token, Length} ->
      unexpected_token(lists:nthtail(Length - 1, String), Line, Column + Length - 1, OriginalScope, Tokens);

    {error, Reason} ->
      error(Reason, String, OriginalScope, Tokens)
  end.

previous_was_dot([{'.', _} | _]) -> true;
previous_was_dot(_) -> false.

unexpected_token([T | Rest], Line, Column, Scope, Tokens) ->
  Message =
    case handle_char(T) of
      {_Escaped, Explanation} ->
        io_lib:format("~ts (column ~p, code point U+~4.16.0B)", [Explanation, Column, T]);
      false ->
        io_lib:format("\"~ts\" (column ~p, code point U+~4.16.0B)", [[T], Column, T])
    end,
  error({Line, Column, "unexpected token: ", Message}, Rest, Scope, Tokens).

tokenize_eol(Rest, Line, Scope, Tokens) ->
  {StrippedRest, Indentation} = strip_horizontal_space(Rest, 0),
  IndentedScope = Scope#elixir_tokenizer{indentation=Indentation},
  tokenize(StrippedRest, Line + 1, Indentation + 1, IndentedScope, Tokens).

strip_horizontal_space([H | T], Counter) when ?is_horizontal_space(H) ->
  strip_horizontal_space(T, Counter + 1);
strip_horizontal_space(T, Counter) ->
  {T, Counter}.

tokenize_dot(T, Line, Column, DotInfo, Scope, Tokens) ->
  case strip_horizontal_space(T, 0) of
    {[$# | R], _} ->
      case tokenize_comment(R, [$#]) of
        {error, Char} ->
          error_comment(Char, [$# | R], Line, Column, Scope, Tokens);

        {Rest, Comment} ->
          preserve_comments(Line, Column, Tokens, Comment, Rest, Scope),
          tokenize_dot(Rest, Line, 1, DotInfo, Scope, Tokens)
      end;
    {"\r\n" ++ Rest, _} ->
      tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens);
    {"\n" ++ Rest, _} ->
      tokenize_dot(Rest, Line + 1, 1, DotInfo, Scope, Tokens);
    {Rest, Length} ->
      handle_dot([$. | Rest], Line, Column + Length, DotInfo, Scope, Tokens)
  end.

handle_char(0)   -> {"\\0", "null byte"};
handle_char(7)   -> {"\\a", "alert"};
handle_char($\b) -> {"\\b", "backspace"};
handle_char($\d) -> {"\\d", "delete"};
handle_char($\e) -> {"\\e", "escape"};
handle_char($\f) -> {"\\f", "form feed"};
handle_char($\n) -> {"\\n", "newline"};
handle_char($\r) -> {"\\r", "carriage return"};
handle_char($\s) -> {"\\s", "space"};
handle_char($\t) -> {"\\t", "tab"};
handle_char($\v) -> {"\\v", "vertical tab"};
handle_char(_)  -> false.

%% Handlers

handle_heredocs(T, Line, Column, H, Scope, Tokens) ->
  case extract_heredoc_with_interpolation(Line, Column, Scope, true, T, H) of
    {ok, NewLine, NewColumn, Parts, Rest, NewScope} ->
      case unescape_tokens(Parts, Line, Column, NewScope) of
        {ok, Unescaped} ->
          Token = {heredoc_type(H), {Line, Column, nil}, NewColumn - 4, Unescaped},
          tokenize(Rest, NewLine, NewColumn, NewScope, [Token | Tokens]);

        {error, Reason} ->
          error(Reason, Rest, Scope, Tokens)
      end;

    {error, Reason} ->
      error(Reason, [H, H, H] ++ T, Scope, Tokens)
  end.

handle_strings(T, Line, Column, H, Scope, Tokens) ->
  case elixir_interpolation:extract(Line, Column, Scope, true, T, H) of
    {error, Reason} ->
      interpolation_error(Reason, [H | T], Scope, Tokens, " (for string starting at line ~B)", [Line]);

    {NewLine, NewColumn, Parts, [$: | Rest], InterScope} when ?is_space(hd(Rest)) ->
      NewScope = case is_unnecessary_quote(Parts, InterScope) of
        true ->
          WarnMsg = io_lib:format(
            "found quoted keyword \"~ts\" but the quotes are not required. "
            "Note that keywords are always atoms, even when quoted. "
            "Similar to atoms, keywords made exclusively of ASCII "
            "letters, numbers, and underscores and not beginning with a "
            "number do not require quotes",
            [hd(Parts)]
          ),
          prepend_warning(Line, Column, WarnMsg, InterScope);

        false ->
          InterScope
      end,

      case unescape_tokens(Parts, Line, Column, NewScope) of
        {ok, [Part]} when is_binary(Part) ->
          case unsafe_to_atom(Part, Line, Column - 1, Scope) of
            {ok, Atom} ->
              Token = {kw_identifier, {Line, Column - 1, nil}, Atom},
              tokenize(Rest, NewLine, NewColumn + 1, NewScope, [Token | Tokens]);
            {error, Reason} ->
              {error, Reason, Rest, Tokens}
          end;

        {ok, Unescaped} ->
          Key = case Scope#elixir_tokenizer.existing_atoms_only of
            true  -> kw_identifier_safe;
            false -> kw_identifier_unsafe
          end,
          Token = {Key, {Line, Column - 1, nil}, Unescaped},
          tokenize(Rest, NewLine, NewColumn + 1, NewScope, [Token | Tokens]);

        {error, Reason} ->
          error(Reason, Rest, NewScope, Tokens)
      end;

    {NewLine, NewColumn, Parts, Rest, NewScope} ->
      case unescape_tokens(Parts, Line, Column, NewScope) of
        {ok, Unescaped} ->
          Token = {string_type(H), {Line, Column - 1, nil}, Unescaped},
          tokenize(Rest, NewLine, NewColumn, NewScope, [Token | Tokens]);

        {error, Reason} ->
          error(Reason, Rest, NewScope, Tokens)
      end
  end.

handle_unary_op([$: | Rest], Line, Column, _Kind, Length, Op, Scope, Tokens) when ?is_space(hd(Rest)) ->
  Token = {kw_identifier, {Line, Column, nil}, Op},
  tokenize(Rest, Line, Column + Length + 1, Scope, [Token | Tokens]);

handle_unary_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
  case strip_horizontal_space(Rest, 0) of
    {[$/ | _] = Remaining, Extra} ->
      Token = {identifier, {Line, Column, nil}, Op},
      tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]);
    {Remaining, Extra} ->
      Token = {Kind, {Line, Column, nil}, Op},
      tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens])
  end.

handle_op([$: | Rest], Line, Column, _Kind, Length, Op, Scope, Tokens) when ?is_space(hd(Rest)) ->
  Token = {kw_identifier, {Line, Column, nil}, Op},
  tokenize(Rest, Line, Column + Length + 1, Scope, [Token | Tokens]);

handle_op(Rest, Line, Column, Kind, Length, Op, Scope, Tokens) ->
  case strip_horizontal_space(Rest, 0) of
    {[$/ | _] = Remaining, Extra} ->
      Token = {identifier, {Line, Column, nil}, Op},
      tokenize(Remaining, Line, Column + Length + Extra, Scope, [Token | Tokens]);
    {Remaining, Extra} ->
      NewScope =
        %% TODO: Remove these deprecations on Elixir v2.0
        case Op of
          '^^^' ->
            Msg = "^^^ is deprecated. It is typically used as xor but it has the wrong precedence, use Bitwise.bxor/2 instead",
            prepend_warning(Line, Column, Msg, Scope);

          '~~~' ->
            Msg = "~~~ is deprecated. Use Bitwise.bnot/1 instead for clarity",
            prepend_warning(Line, Column, Msg, Scope);

          '<|>' ->
            Msg = "<|> is deprecated. Use another pipe-like operator",
            prepend_warning(Line, Column, Msg, Scope);

          _ ->
            Scope
        end,

      Token = {Kind, {Line, Column, previous_was_eol(Tokens)}, Op},
      tokenize(Remaining, Line, Column + Length + Extra, NewScope, add_token_with_eol(Token, Tokens))
  end.

% ## Three Token Operators
handle_dot([$., T1, T2, T3 | Rest], Line, Column, DotInfo, Scope, Tokens) when
    ?unary_op3(T1, T2, T3); ?comp_op3(T1, T2, T3); ?and_op3(T1, T2, T3); ?or_op3(T1, T2, T3);
    ?arrow_op3(T1, T2, T3); ?xor_op3(T1, T2, T3); ?concat_op3(T1, T2, T3) ->
  handle_call_identifier(Rest, Line, Column, DotInfo, 3, [T1, T2, T3], Scope, Tokens);

% ## Two Token Operators
handle_dot([$., T1, T2 | Rest], Line, Column, DotInfo, Scope, Tokens) when
    ?comp_op2(T1, T2); ?rel_op2(T1, T2); ?and_op(T1, T2); ?or_op(T1, T2);
    ?arrow_op(T1, T2); ?in_match_op(T1, T2); ?concat_op(T1, T2); ?power_op(T1, T2);
    ?type_op(T1, T2); ?range_op(T1, T2) ->
  handle_call_identifier(Rest, Line, Column, DotInfo, 2, [T1, T2], Scope, Tokens);

% ## Single Token Operators
handle_dot([$., T | Rest], Line, Column, DotInfo, Scope, Tokens) when
    ?at_op(T); ?unary_op(T); ?capture_op(T); ?dual_op(T); ?mult_op(T);
    ?rel_op(T); ?match_op(T); ?pipe_op(T) ->
  handle_call_identifier(Rest, Line, Column, DotInfo, 1, [T], Scope, Tokens);

% ## Exception for .( as it needs to be treated specially in the parser
handle_dot([$., $( | Rest], Line, Column, DotInfo, Scope, Tokens) ->
  TokensSoFar = add_token_with_eol({dot_call_op, DotInfo, '.'}, Tokens),
  tokenize([$( | Rest], Line, Column, Scope, TokensSoFar);

handle_dot([$., H | T] = Original, Line, Column, DotInfo, Scope, Tokens) when ?is_quote(H) ->
  case elixir_interpolation:extract(Line, Column + 1, Scope, true, T, H) of
    {NewLine, NewColumn, [Part], Rest, InterScope} when is_list(Part) ->
      NewScope = case is_unnecessary_quote([Part], InterScope) of
        true ->
          WarnMsg = io_lib:format(
            "found quoted call \"~ts\" but the quotes are not required. "
            "Calls made exclusively of Unicode letters, numbers, and underscores "
            "and not beginning with a number do not require quotes",
            [Part]
          ),
          prepend_warning(Line, Column, WarnMsg, InterScope);

        false ->
          InterScope
      end,

      case unsafe_to_atom(Part, Line, Column, NewScope) of
        {ok, Atom} ->
          Token = check_call_identifier(Line, Column, Part, Atom, Rest),
          TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens),
          tokenize(Rest, NewLine, NewColumn, NewScope, [Token | TokensSoFar]);

        {error, Reason} ->
          error(Reason, Original, NewScope, Tokens)
      end;
    {_NewLine, _NewColumn, _Parts, Rest, NewScope} ->
      Message = "interpolation is not allowed when calling function/macro. Found interpolation in a call starting with: ",
      error({Line, Column, Message, [H]}, Rest, NewScope, Tokens);
    {error, Reason} ->
      interpolation_error(Reason, Original, Scope, Tokens, " (for function name starting at line ~B)", [Line])
  end;

handle_dot([$. | Rest], Line, Column, DotInfo, Scope, Tokens) ->
  TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens),
  tokenize(Rest, Line, Column, Scope, TokensSoFar).

handle_call_identifier(Rest, Line, Column, DotInfo, Length, UnencodedOp, Scope, Tokens) ->
  Token = check_call_identifier(Line, Column, UnencodedOp, list_to_atom(UnencodedOp), Rest),
  TokensSoFar = add_token_with_eol({'.', DotInfo}, Tokens),
  tokenize(Rest, Line, Column + Length, Scope, [Token | TokensSoFar]).

% ## Ambiguous unary/binary operators tokens
handle_space_sensitive_tokens([Sign, NotMarker | T], Line, Column, Scope, [{Identifier, _, _} = H | Tokens]) when
    ?dual_op(Sign),
    not(?is_space(NotMarker)),
    NotMarker =/= $(, NotMarker =/= $[, NotMarker =/= $<, NotMarker =/= ${,                   %% containers
    NotMarker =/= $%, NotMarker =/= $+, NotMarker =/= $-, NotMarker =/= $/, NotMarker =/= $>, %% operators
    NotMarker =/= $:, %% keywords
    Identifier == identifier ->
  Rest = [NotMarker | T],
  DualOpToken = {dual_op, {Line, Column, nil}, list_to_atom([Sign])},
  tokenize(Rest, Line, Column + 1, Scope, [DualOpToken, setelement(1, H, op_identifier) | Tokens]);

handle_space_sensitive_tokens([], Line, Column,
                              #elixir_tokenizer{cursor_completion=Cursor} = Scope,
                              [{identifier, Info, Identifier} | Tokens]) when Cursor /= false ->
  tokenize([$(], Line, Column+1, Scope, [{paren_identifier, Info, Identifier} | Tokens]);

handle_space_sensitive_tokens(String, Line, Column, Scope, Tokens) ->
  tokenize(String, Line, Column, Scope, Tokens).

%% Helpers

eol(_Line, _Column, [{',', {Line, Column, Count}} | Tokens]) ->
  [{',', {Line, Column, Count + 1}} | Tokens];
eol(_Line, _Column, [{';', {Line, Column, Count}} | Tokens]) ->
  [{';', {Line, Column, Count + 1}} | Tokens];
eol(_Line, _Column, [{eol, {Line, Column, Count}} | Tokens]) ->
  [{eol, {Line, Column, Count + 1}} | Tokens];
eol(Line, Column, Tokens) ->
  [{eol, {Line, Column, 1}} | Tokens].

is_unnecessary_quote([Part], #elixir_tokenizer{warn_on_unnecessary_quotes=true} = Scope) when is_list(Part) ->
  case (Scope#elixir_tokenizer.identifier_tokenizer):tokenize(Part) of
    {identifier, _, [], _, true, Special} -> not lists:member(at, Special);
    _ -> false
  end;

is_unnecessary_quote(_Parts, _Scope) ->
  false.

unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{}) when
    is_binary(Part) andalso byte_size(Part) > 255;
    is_list(Part) andalso length(Part) > 255 ->
  {error, {Line, Column, "atom length must be less than system limit: ", elixir_utils:characters_to_list(Part)}};
unsafe_to_atom(Part, Line, Column, #elixir_tokenizer{static_atoms_encoder=StaticAtomsEncoder}) when
    is_function(StaticAtomsEncoder) ->
  Value = elixir_utils:characters_to_binary(Part),
  case StaticAtomsEncoder(Value, [{line, Line}, {column, Column}]) of
    {ok, Term} ->
      {ok, Term};
    {error, Reason} when is_binary(Reason) ->
      {error, {Line, Column, elixir_utils:characters_to_list(Reason) ++ ": ", elixir_utils:characters_to_list(Part)}}
  end;
unsafe_to_atom(Binary, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_binary(Binary) ->
  try
    {ok, binary_to_existing_atom(Binary, utf8)}
  catch
    error:badarg -> {error, {Line, Column, "unsafe atom does not exist: ", elixir_utils:characters_to_list(Binary)}}
  end;
unsafe_to_atom(Binary, _Line, _Column, #elixir_tokenizer{}) when is_binary(Binary) ->
  {ok, binary_to_atom(Binary, utf8)};
unsafe_to_atom(List, Line, Column, #elixir_tokenizer{existing_atoms_only=true}) when is_list(List) ->
  try
    {ok, list_to_existing_atom(List)}
  catch
    error:badarg -> {error, {Line, Column, "unsafe atom does not exist: ", List}}
  end;
unsafe_to_atom(List, _Line, _Column, #elixir_tokenizer{}) when is_list(List) ->
  {ok, list_to_atom(List)}.

collect_modifiers([H | T], Buffer) when ?is_downcase(H) or ?is_upcase(H) or ?is_digit(H) ->
  collect_modifiers(T, [H | Buffer]);

collect_modifiers(Rest, Buffer) ->
  {Rest, lists:reverse(Buffer)}.

%% Heredocs

extract_heredoc_with_interpolation(Line, Column, Scope, Interpol, T, H) ->
  case extract_heredoc_header(T) of
    {ok, Headerless} ->
      %% We prepend a new line so we can transparently remove
      %% spaces later. This new line is removed by calling "tl"
      %% in the final heredoc body three lines below.
      case elixir_interpolation:extract(Line, Column, Scope, Interpol, [$\n|Headerless], [H,H,H]) of
        {NewLine, NewColumn, Parts0, Rest, InterScope} ->
          Indent = NewColumn - 4,
          Fun = fun(Part, Acc) -> extract_heredoc_indent(Part, Acc, Indent) end,
          {Parts1, {ShouldWarn, _}} = lists:mapfoldl(Fun, {false, Line}, Parts0),
          Parts2 = extract_heredoc_head(Parts1),
          NewScope = maybe_heredoc_warn(ShouldWarn, Column, InterScope, H),
          {ok, NewLine, NewColumn, tokens_to_binary(Parts2), Rest, NewScope};

        {error, Reason} ->
          {error, interpolation_format(Reason, " (for heredoc starting at line ~B)", [Line])}
      end;

    error ->
      Message = "heredoc allows only zero or more whitespace characters followed by a new line after ",
      {error, {Line, Column, io_lib:format(Message, []), [H, H, H]}}
  end.

extract_heredoc_header("\r\n" ++ Rest) ->
  {ok, Rest};
extract_heredoc_header("\n" ++ Rest) ->
  {ok, Rest};
extract_heredoc_header([H | T]) when ?is_horizontal_space(H) ->
  extract_heredoc_header(T);
extract_heredoc_header(_) ->
  error.

extract_heredoc_indent(Part, {Warned, Line}, Indent) when is_list(Part) ->
  extract_heredoc_indent(Part, [], Warned, Line, Indent);
extract_heredoc_indent({_, {EndLine, _, _}, _} = Part, {Warned, _Line}, _Indent) ->
  {Part, {Warned, EndLine}}.

extract_heredoc_indent([$\n | Rest], Acc, Warned, Line, Indent) ->
  {Trimmed, ShouldWarn} = trim_space(Rest, Indent),
  Warn = if ShouldWarn, not Warned -> Line + 1; true -> Warned end,
  extract_heredoc_indent(Trimmed, [$\n | Acc], Warn, Line + 1, Indent);
extract_heredoc_indent([Head | Rest], Acc, Warned, Line, Indent) ->
  extract_heredoc_indent(Rest, [Head | Acc], Warned, Line, Indent);
extract_heredoc_indent([], Acc, Warned, Line, _Indent) ->
  {lists:reverse(Acc), {Warned, Line}}.

trim_space(Rest, 0) -> {Rest, false};
trim_space([$\r, $\n | _] = Rest, _) -> {Rest, false};
trim_space([$\n | _] = Rest, _) -> {Rest, false};
trim_space([H | T], Spaces) when ?is_horizontal_space(H) -> trim_space(T, Spaces - 1);
trim_space([], _Spaces) -> {[], false};
trim_space(Rest, _Spaces) -> {Rest, true}.

maybe_heredoc_warn(false, _Column, Scope, _Marker) ->
  Scope;
maybe_heredoc_warn(Line, Column, Scope, Marker) ->
  Msg = io_lib:format("outdented heredoc line. The contents inside the heredoc should be indented "
                      "at the same level as the closing ~ts. The following is forbidden:~n~n"
                      "    def text do~n"
                      "      \"\"\"~n"
                      "    contents~n"
                      "      \"\"\"~n"
                      "    end~n~n"
                      "Instead make sure the contents are indented as much as the heredoc closing:~n~n"
                      "    def text do~n"
                      "      \"\"\"~n"
                      "      contents~n"
                      "      \"\"\"~n"
                      "    end~n~n"
                      "The current heredoc line is indented too little", [[Marker, Marker, Marker]]),

  prepend_warning(Line, Column, Msg, Scope).

extract_heredoc_head([[$\n|H]|T]) -> [H|T].

unescape_tokens(Tokens, Line, Column, #elixir_tokenizer{unescape=true}) ->
  case elixir_interpolation:unescape_tokens(Tokens) of
    {ok, Result} ->
      {ok, Result};

    {error, Message, Token} ->
      {error, {Line, Column, Message ++ ". Syntax error after: ", Token}}
  end;
unescape_tokens(Tokens, _Line, _Column, #elixir_tokenizer{unescape=false}) ->
  {ok, tokens_to_binary(Tokens)}.

tokens_to_binary(Tokens) ->
  [if is_list(Token) -> elixir_utils:characters_to_binary(Token); true -> Token end
   || Token <- Tokens].

%% Integers and floats
%% At this point, we are at least sure the first digit is a number.

%% Check if we have a point followed by a number;
tokenize_number([$., H | T], Acc, Length, false) when ?is_digit(H) ->
  tokenize_number(T, [H, $. | Acc], Length + 2, true);

%% Check if we have an underscore followed by a number;
tokenize_number([$_, H | T], Acc, Length, Bool) when ?is_digit(H) ->
  tokenize_number(T, [H, $_ | Acc], Length + 2, Bool);

%% Check if we have e- followed by numbers (valid only for floats);
tokenize_number([E, S, H | T], Acc, Length, true)
    when (E =:= $E) or (E =:= $e), ?is_digit(H), S =:= $+ orelse S =:= $- ->
  tokenize_number(T, [H, S, E | Acc], Length + 3, true);

%% Check if we have e followed by numbers (valid only for floats);
tokenize_number([E, H | T], Acc, Length, true)
    when (E =:= $E) or (E =:= $e), ?is_digit(H) ->
  tokenize_number(T, [H, E | Acc], Length + 2, true);

%% Finally just numbers.
tokenize_number([H | T], Acc, Length, Bool) when ?is_digit(H) ->
  tokenize_number(T, [H | Acc], Length + 1, Bool);

%% Cast to float...
tokenize_number(Rest, Acc, Length, true) ->
  try
    {Number, Original} = reverse_number(Acc, [], []),
    {Rest, list_to_float(Number), Original, Length}
  catch
    error:badarg -> {error, "invalid float number ", lists:reverse(Acc)}
  end;

%% Or integer.
tokenize_number(Rest, Acc, Length, false) ->
  {Number, Original} = reverse_number(Acc, [], []),
  {Rest, list_to_integer(Number), Original, Length}.

tokenize_hex([H | T], Acc, Length) when ?is_hex(H) ->
  tokenize_hex(T, [H | Acc], Length + 1);
tokenize_hex([$_, H | T], Acc, Length) when ?is_hex(H) ->
  tokenize_hex(T, [H, $_ | Acc], Length + 2);
tokenize_hex(Rest, Acc, Length) ->
  {Number, Original} = reverse_number(Acc, [], []),
  {Rest, list_to_integer(Number, 16), [$0, $x | Original], Length}.

tokenize_octal([H | T], Acc, Length) when ?is_octal(H) ->
  tokenize_octal(T, [H | Acc], Length + 1);
tokenize_octal([$_, H | T], Acc, Length) when ?is_octal(H) ->
  tokenize_octal(T, [H, $_ | Acc], Length + 2);
tokenize_octal(Rest, Acc, Length) ->
  {Number, Original} = reverse_number(Acc, [], []),
  {Rest, list_to_integer(Number, 8), [$0, $o | Original], Length}.

tokenize_bin([H | T], Acc, Length) when ?is_bin(H) ->
  tokenize_bin(T, [H | Acc], Length + 1);
tokenize_bin([$_, H | T], Acc, Length) when ?is_bin(H) ->
  tokenize_bin(T, [H, $_ | Acc], Length + 2);
tokenize_bin(Rest, Acc, Length) ->
  {Number, Original} = reverse_number(Acc, [], []),
  {Rest, list_to_integer(Number, 2), [$0, $b | Original], Length}.

reverse_number([$_ | T], Number, Original) ->
  reverse_number(T, Number, [$_ | Original]);
reverse_number([H | T], Number, Original) ->
  reverse_number(T, [H | Number], [H | Original]);
reverse_number([], Number, Original) ->
  {Number, Original}.

%% Comments

reset_eol([{eol, {Line, Column, _}} | Rest]) -> [{eol, {Line, Column, 0}} | Rest];
reset_eol(Rest) -> Rest.

tokenize_comment("\r\n" ++ _ = Rest, Acc) ->
  {Rest, lists:reverse(Acc)};
tokenize_comment("\n" ++ _ = Rest, Acc) ->
  {Rest, lists:reverse(Acc)};
tokenize_comment([H | _Rest], _) when ?bidi(H) ->
  {error, H};
tokenize_comment([H | Rest], Acc) ->
  tokenize_comment(Rest, [H | Acc]);
tokenize_comment([], Acc) ->
  {[], lists:reverse(Acc)}.

error_comment(H, Comment, Line, Column, Scope, Tokens) ->
  Token = io_lib:format("\\u~4.16.0B", [H]),
  Reason = {Line, Column, "invalid bidirectional formatting character in comment: ", Token},
  error(Reason, Comment, Scope, Tokens).

preserve_comments(Line, Column, Tokens, Comment, Rest, Scope) ->
  case Scope#elixir_tokenizer.preserve_comments of
    Fun when is_function(Fun) ->
      Fun(Line, Column, Tokens, Comment, Rest);
    nil ->
      ok
  end.

%% Identifiers

tokenize([H | T]) when ?is_upcase(H) ->
  {Acc, Rest, Length, Special} = tokenize_continue(T, [H], 1, []),
  {alias, lists:reverse(Acc), Rest, Length, true, Special};
tokenize([H | T]) when ?is_downcase(H); H =:= $_ ->
  {Acc, Rest, Length, Special} = tokenize_continue(T, [H], 1, []),
  {identifier, lists:reverse(Acc), Rest, Length, true, Special};
tokenize(_List) ->
  {error, empty}.

tokenize_continue([$@ | T], Acc, Length, Special) ->
  tokenize_continue(T, [$@ | Acc], Length + 1, [at | lists:delete(at, Special)]);
tokenize_continue([$! | T], Acc, Length, Special) ->
  {[$! | Acc], T, Length + 1, [punctuation | Special]};
tokenize_continue([$? | T], Acc, Length, Special) ->
  {[$? | Acc], T, Length + 1, [punctuation | Special]};
tokenize_continue([H | T], Acc, Length, Special) when ?is_upcase(H); ?is_downcase(H); ?is_digit(H); H =:= $_ ->
  tokenize_continue(T, [H | Acc], Length + 1, Special);
tokenize_continue(Rest, Acc, Length, Special) ->
  {Acc, Rest, Length, Special}.

tokenize_identifier(String, Line, Column, Scope, MaybeKeyword) ->
  case (Scope#elixir_tokenizer.identifier_tokenizer):tokenize(String) of
    {Kind, Acc, Rest, Length, Ascii, Special} ->
      Keyword = MaybeKeyword andalso maybe_keyword(Rest),

      case keyword_or_unsafe_to_atom(Keyword, Acc, Line, Column, Scope) of
        {keyword, Atom, Type} ->
          {keyword, Atom, Type, Rest, Length};
        {ok, Atom} ->
          {Kind, Acc, Atom, Rest, Length, Ascii, Special};
        {error, _Reason} = Error ->
          Error
      end;

    {error, {not_highly_restrictive, Wrong, {Prefix, Suffix}}} ->
      WrongColumn = Column + length(Wrong) - 1,
      case suggest_simpler_unexpected_token_in_error(Wrong, Line, WrongColumn, Scope) of
        no_suggestion ->
          %% we append a pointer to more info if we aren't appending a suggestion
          MoreInfo = "\nSee https://hexdocs.pm/elixir/unicode-syntax.html for more information.",
          {error, {Line, Column, {Prefix, Suffix ++ MoreInfo}, Wrong}};

        {_, {Line, WrongColumn, _, SuggestionMessage}} = _SuggestionError ->
          {error, {Line, WrongColumn, {Prefix, Suffix ++ SuggestionMessage}, Wrong}}
      end;

    {error, {unexpected_token, Wrong}} ->
      WrongColumn = Column + length(Wrong) - 1,
      case suggest_simpler_unexpected_token_in_error(Wrong, Line, WrongColumn, Scope) of
        no_suggestion ->
          [T | _] = lists:reverse(Wrong),
          case suggest_simpler_unexpected_token_in_error([T], Line, WrongColumn, Scope) of
            no_suggestion -> {unexpected_token, length(Wrong)};
            SuggestionError -> SuggestionError
          end;

        SuggestionError ->
          SuggestionError
      end;

    {error, empty} ->
      empty
  end.

%% heuristic: try nfkc; try confusability skeleton; try calling this again w/just failed codepoint
suggest_simpler_unexpected_token_in_error(Wrong, Line, WrongColumn, Scope) ->
  NFKC = unicode:characters_to_nfkc_list(Wrong),
  case (Scope#elixir_tokenizer.identifier_tokenizer):tokenize(NFKC) of
    {error, _Reason} ->
       ConfusableSkeleton = 'Elixir.String.Tokenizer.Security':confusable_skeleton(Wrong),
       case (Scope#elixir_tokenizer.identifier_tokenizer):tokenize(ConfusableSkeleton) of
         {_, Simpler, _, _, _, _} ->
           Message = suggest_change("Codepoint failed identifier tokenization, but a simpler form was found.",
                                    Wrong,
                                    "You could write the above in a similar way that is accepted by Elixir:",
                                    Simpler,
                                    "See https://hexdocs.pm/elixir/unicode-syntax.html for more information."),
           {error, {Line, WrongColumn, "unexpected token: ", Message}};
         _other ->
           no_suggestion
       end;
    {_, _NFKC, _, _, _, _} ->
      Message = suggest_change("Elixir expects unquoted Unicode atoms, variables, and calls to use allowed codepoints and to be in NFC form.",
                               Wrong,
                               "You could write the above in a compatible format that is accepted by Elixir:",
                               NFKC,
                               "See https://hexdocs.pm/elixir/unicode-syntax.html for more information."),
          {error, {Line, WrongColumn, "unexpected token: ", Message}}
    end.

suggest_change(Intro, WrongForm, Hint, HintedForm, Ending) ->
  WrongCodepoints = list_to_codepoint_hex(WrongForm),
  HintedCodepoints = list_to_codepoint_hex(HintedForm),
  io_lib:format("~ts\n\nGot:\n\n    \"~ts\" (code points~ts)\n\n"
                "Hint: ~ts\n\n    \"~ts\" (code points~ts)\n\n~ts",
                [Intro, WrongForm, WrongCodepoints, Hint, HintedForm, HintedCodepoints, Ending]).

maybe_keyword([]) -> true;
maybe_keyword([$:, $: | _]) -> true;
maybe_keyword([$: | _]) -> false;
maybe_keyword(_) -> true.

list_to_codepoint_hex(List) ->
  [io_lib:format(" 0x~5.16.0B", [Codepoint]) || Codepoint <- List].

tokenize_alias(Rest, Line, Column, Unencoded, Atom, Length, Ascii, Special, Scope, Tokens) ->
  if
    not Ascii or (Special /= []) ->
      Invalid = hd([C || C <- Unencoded, (C < $A) or (C > 127)]),
      Reason = {Line, Column, invalid_character_error("alias (only ASCII characters, without punctuation, are allowed)", Invalid), Unencoded},
      error(Reason, Unencoded ++ Rest, Scope, Tokens);

    true ->
      AliasesToken = {alias, {Line, Column, Unencoded}, Atom},
      tokenize(Rest, Line, Column + Length, Scope, [AliasesToken | Tokens])
  end.

%% Check if it is a call identifier (paren | bracket | do)

check_call_identifier(Line, Column, Unencoded, Atom, [$( | _]) ->
  {paren_identifier, {Line, Column, Unencoded}, Atom};
check_call_identifier(Line, Column, Unencoded, Atom, [$[ | _]) ->
  {bracket_identifier, {Line, Column, Unencoded}, Atom};
check_call_identifier(Line, Column, Unencoded, Atom, _Rest) ->
  {identifier, {Line, Column, Unencoded}, Atom}.

add_token_with_eol({unary_op, _, _} = Left, T) -> [Left | T];
add_token_with_eol(Left, [{eol, _} | T]) -> [Left | T];
add_token_with_eol(Left, T) -> [Left | T].

previous_was_eol([{',', {_, _, Count}} | _]) when Count > 0 -> Count;
previous_was_eol([{';', {_, _, Count}} | _]) when Count > 0 -> Count;
previous_was_eol([{eol, {_, _, Count}} | _]) when Count > 0 -> Count;
previous_was_eol(_) -> nil.

%% Error handling

interpolation_error(Reason, Rest, Scope, Tokens, Extension, Args) ->
  error(interpolation_format(Reason, Extension, Args), Rest, Scope, Tokens).

interpolation_format({string, Line, Column, Message, Token}, Extension, Args) ->
  {Line, Column, [Message, io_lib:format(Extension, Args)], Token};
interpolation_format({_, _, _, _} = Reason, _Extension, _Args) ->
  Reason.

%% Terminators

handle_terminator(Rest, _, _, Scope, {'(', {Line, Column, _}}, [{alias, _, Alias} | Tokens]) ->
  Reason =
    io_lib:format(
      "unexpected ( after alias ~ts. Function names and identifiers in Elixir "
      "start with lowercase characters or underscore. For example:\n\n"
      "    hello_world()\n"
      "    _starting_with_underscore()\n"
      "    numb3rs_are_allowed()\n"
      "    may_finish_with_question_mark?()\n"
      "    may_finish_with_exclamation_mark!()\n\n"
      "Unexpected token: ",
      [Alias]
    ),

  error({Line, Column, Reason, ["("]}, atom_to_list(Alias) ++ [$( | Rest], Scope, Tokens);
handle_terminator(Rest, Line, Column, #elixir_tokenizer{terminators=none} = Scope, Token, Tokens) ->
  tokenize(Rest, Line, Column, Scope, [Token | Tokens]);
handle_terminator(Rest, Line, Column, Scope, Token, Tokens) ->
  #elixir_tokenizer{terminators=Terminators} = Scope,

  case check_terminator(Token, Terminators, Scope) of
    {error, Reason} ->
      error(Reason, atom_to_list(element(1, Token)) ++ Rest, Scope, Tokens);
    {ok, New} ->
      tokenize(Rest, Line, Column, New, [Token | Tokens])
  end.

check_terminator({Start, {Line, _, _}}, Terminators, Scope)
    when Start == '('; Start == '['; Start == '{'; Start == '<<' ->
  Indentation = Scope#elixir_tokenizer.indentation,
  {ok, Scope#elixir_tokenizer{terminators=[{Start, Line, Indentation} | Terminators]}};

check_terminator({Start, {Line, _, _}}, Terminators, Scope) when Start == 'fn'; Start == 'do' ->
  Indentation = Scope#elixir_tokenizer.indentation,

  NewScope =
    case Terminators of
      %% If the do is indented equally or less than the previous do, it may be a missing end error!
      [{Start, _, PreviousIndentation} = Previous | _] when Indentation =< PreviousIndentation ->
        Scope#elixir_tokenizer{mismatch_hints=[Previous | Scope#elixir_tokenizer.mismatch_hints]};

      _ ->
        Scope
    end,

  {ok, NewScope#elixir_tokenizer{terminators=[{Start, Line, Indentation} | Terminators]}};

check_terminator({'end', {EndLine, _, _}}, [{'do', _, Indentation} | Terminators], Scope) ->
  NewScope =
    %% If the end is more indented than the do, it may be a missing do error!
    case Scope#elixir_tokenizer.indentation > Indentation of
      true ->
        Hint = {'end', EndLine, Scope#elixir_tokenizer.indentation},
        Scope#elixir_tokenizer{mismatch_hints=[Hint | Scope#elixir_tokenizer.mismatch_hints]};

      false ->
        Scope
    end,

  {ok, NewScope#elixir_tokenizer{terminators=Terminators}};

check_terminator({End, {EndLine, EndColumn, _}}, [{Start, StartLine, _} | Terminators], Scope)
    when End == 'end'; End == ')'; End == ']'; End == '}'; End == '>>' ->
  case terminator(Start) of
    End ->
      {ok, Scope#elixir_tokenizer{terminators=Terminators}};

    ExpectedEnd ->
      Suffix =
        io_lib:format(
          "\n\n    HINT: the \"~ts\" on line ~B is missing terminator \"~ts\"\n",
          [Start, StartLine, ExpectedEnd]
        ),
      {error, {EndLine, EndColumn, {unexpected_token_or_reserved(End), Suffix}, [atom_to_list(End)]}}
  end;

check_terminator({'end', {Line, Column, _}}, [], #elixir_tokenizer{mismatch_hints=Hints}) ->
  Suffix =
    case lists:keyfind('end', 1, Hints) of
      {'end', HintLine, _Identation} ->
        io_lib:format("\n\n    HINT: the \"end\" on line ~B may not have a matching \"do\" "
                      "defined before it (based on indentation)\n", [HintLine]);
      false ->
        ""
    end,

  {error, {Line, Column, {"unexpected reserved word: ", Suffix}, "end"}};

check_terminator({End, {Line, Column, _}}, [], _Scope)
    when End == ')'; End == ']'; End == '}'; End == '>>' ->
  {error, {Line, Column, "unexpected token: ", atom_to_list(End)}};

check_terminator(_, _, Scope) ->
  {ok, Scope}.

unexpected_token_or_reserved('end') -> "unexpected reserved word: ";
unexpected_token_or_reserved(_) -> "unexpected token: ".

missing_terminator_hint(Start, End, #elixir_tokenizer{mismatch_hints=Hints}) ->
  case lists:keyfind(Start, 1, Hints) of
    {Start, HintLine, _} ->
      io_lib:format("\n\n    HINT: it looks like the \"~ts\" on line ~B does not have a matching \"~ts\"\n",
                    [Start, HintLine, End]);
    false ->
      ""
  end.

string_type($") -> bin_string;
string_type($') -> list_string.

heredoc_type($") -> bin_heredoc;
heredoc_type($') -> list_heredoc.

sigil_terminator($() -> $);
sigil_terminator($[) -> $];
sigil_terminator(${) -> $};
sigil_terminator($<) -> $>;
sigil_terminator(O) -> O.

terminator('fn') -> 'end';
terminator('do') -> 'end';
terminator('(')  -> ')';
terminator('[')  -> ']';
terminator('{')  -> '}';
terminator('<<') -> '>>'.

%% Keywords checking

keyword_or_unsafe_to_atom(true, "fn", _Line, _Column, _Scope) -> {keyword, 'fn', terminator};
keyword_or_unsafe_to_atom(true, "do", _Line, _Column, _Scope) -> {keyword, 'do', terminator};
keyword_or_unsafe_to_atom(true, "end", _Line, _Column, _Scope) -> {keyword, 'end', terminator};
keyword_or_unsafe_to_atom(true, "true", _Line, _Column, _Scope) -> {keyword, 'true', token};
keyword_or_unsafe_to_atom(true, "false", _Line, _Column, _Scope) -> {keyword, 'false', token};
keyword_or_unsafe_to_atom(true, "nil", _Line, _Column, _Scope) -> {keyword, 'nil', token};

keyword_or_unsafe_to_atom(true, "not", _Line, _Column, _Scope) -> {keyword, 'not', unary_op};
keyword_or_unsafe_to_atom(true, "and", _Line, _Column, _Scope) -> {keyword, 'and', and_op};
keyword_or_unsafe_to_atom(true, "or", _Line, _Column, _Scope) -> {keyword, 'or', or_op};
keyword_or_unsafe_to_atom(true, "when", _Line, _Column, _Scope) -> {keyword, 'when', when_op};
keyword_or_unsafe_to_atom(true, "in", _Line, _Column, _Scope) -> {keyword, 'in', in_op};

keyword_or_unsafe_to_atom(true, "after", _Line, _Column, _Scope) -> {keyword, 'after', block};
keyword_or_unsafe_to_atom(true, "else", _Line, _Column, _Scope) -> {keyword, 'else', block};
keyword_or_unsafe_to_atom(true, "catch", _Line, _Column, _Scope) -> {keyword, 'catch', block};
keyword_or_unsafe_to_atom(true, "rescue", _Line, _Column, _Scope) -> {keyword, 'rescue', block};

keyword_or_unsafe_to_atom(_, Part, Line, Column, Scope) ->
  unsafe_to_atom(Part, Line, Column, Scope).

tokenize_keyword(terminator, Rest, Line, Column, Atom, Length, Scope, Tokens) ->
  case tokenize_keyword_terminator(Line, Column, Atom, Tokens) of
    {ok, [Check | T]} ->
      handle_terminator(Rest, Line, Column + Length, Scope, Check, T);
    {error, Message, Token} ->
      error({Line, Column, Message, Token}, Token ++ Rest, Scope, Tokens)
  end;

tokenize_keyword(token, Rest, Line, Column, Atom, Length, Scope, Tokens) ->
  Token = {Atom, {Line, Column, nil}},
  tokenize(Rest, Line, Column + Length, Scope, [Token | Tokens]);

tokenize_keyword(block, Rest, Line, Column, Atom, Length, Scope, Tokens) ->
  Token = {block_identifier, {Line, Column, nil}, Atom},
  tokenize(Rest, Line, Column + Length, Scope, [Token | Tokens]);

tokenize_keyword(Kind, Rest, Line, Column, Atom, Length, Scope, Tokens) ->
  NewTokens =
    case strip_horizontal_space(Rest, 0) of
      {[$/ | _], _} ->
        [{identifier, {Line, Column, nil}, Atom} | Tokens];

      _ ->
        case {Kind, Tokens} of
          {in_op, [{unary_op, NotInfo, 'not'} | T]} ->
            add_token_with_eol({in_op, NotInfo, 'not in'}, T);

          {_, _} ->
            add_token_with_eol({Kind, {Line, Column, previous_was_eol(Tokens)}, Atom}, Tokens)
        end
    end,

  tokenize(Rest, Line, Column + Length, Scope, NewTokens).

tokenize_sigil([$~ | T], Line, Column, Scope, Tokens) ->
  case tokenize_sigil_name(T, [], Line, Column + 1, Scope, Tokens) of
    {ok, Name, Rest, NewLine, NewColumn, NewScope, NewTokens} ->
      tokenize_sigil_contents(Rest, Name, NewLine, NewColumn, NewScope, NewTokens);
    
    {error, Message, Token} ->
      Reason = {Line, Column, Message, Token},
      error(Reason, T, Scope, Tokens)
  end.

% A one-letter sigil is ok both as upcase as well as downcase.
tokenize_sigil_name([S | T], [], Line, Column, Scope, Tokens) when ?is_upcase(S) orelse ?is_downcase(S) ->
  tokenize_sigil_name(T, [S], Line, Column + 1, Scope, Tokens);
% If we have an uppercase letter, we keep tokenizing the name.
tokenize_sigil_name([S | T], NameAcc, Line, Column, Scope, Tokens) when ?is_upcase(S) ->
  tokenize_sigil_name(T, [S | NameAcc], Line, Column + 1, Scope, Tokens);
% With a lowercase letter and a non-empty NameAcc we return an error.
tokenize_sigil_name([S | _T] = Original, [_ | _] = NameAcc, _Line, _Column, _Scope, _Tokens) when ?is_downcase(S) ->
  Message = "invalid sigil name, it should be either a one-letter lowercase letter or a" ++ 
            " sequence of uppercase letters only, got: ",
  {error, Message, [$~] ++ lists:reverse(NameAcc) ++ Original};
% We finished the letters, so the name is over.
tokenize_sigil_name(T, NameAcc, Line, Column, Scope, Tokens) ->
  {ok, lists:reverse(NameAcc), T, Line, Column, Scope, Tokens}.

tokenize_sigil_contents([H, H, H | T] = Original, [S | _] = SigilName, Line, Column, Scope, Tokens)
    when ?is_quote(H) ->
  case extract_heredoc_with_interpolation(Line, Column, Scope, ?is_downcase(S), T, H) of
    {ok, NewLine, NewColumn, Parts, Rest, NewScope} ->
      {Final, Modifiers} = collect_modifiers(Rest, []),
      Indentation = NewColumn - 4,
      TokenColumn = Column - 1 - length(SigilName),
      Token = {sigil, {Line, TokenColumn, nil}, SigilName, Parts, Modifiers, Indentation, <<H, H, H>>},
      NewColumnWithModifiers = NewColumn + length(Modifiers),
      tokenize(Final, NewLine, NewColumnWithModifiers, NewScope, [Token | Tokens]);

    {error, Reason} ->
      error(Reason, [$~] ++ SigilName ++ Original, Scope, Tokens)
  end;

tokenize_sigil_contents([H | T] = Original, [S | _] = SigilName, Line, Column, Scope, Tokens)
    when ?is_sigil(H) ->
  case elixir_interpolation:extract(Line, Column + 1, Scope, ?is_downcase(S), T, sigil_terminator(H)) of
    {NewLine, NewColumn, Parts, Rest, NewScope} ->
      {Final, Modifiers} = collect_modifiers(Rest, []),
      Indentation = nil,
      TokenColumn = Column - 1 - length(SigilName),
      Token = {sigil, {Line, TokenColumn, nil}, SigilName, tokens_to_binary(Parts), Modifiers, Indentation, <<H>>},
      NewColumnWithModifiers = NewColumn + length(Modifiers),
      tokenize(Final, NewLine, NewColumnWithModifiers, NewScope, [Token | Tokens]);

    {error, Reason} ->
      Sigil = [$~, S, H],
      Message = " (for sigil ~ts starting at line ~B)",
      interpolation_error(Reason, [$~] ++ SigilName ++ Original, Scope, Tokens, Message, [Sigil, Line])
  end;

tokenize_sigil_contents([H | _] = Original, SigilName, Line, Column, Scope, Tokens) ->
  MessageString =
    "\"~ts\" (column ~p, code point U+~4.16.0B). The available delimiters are: "
    "//, ||, \"\", '', (), [], {}, <>",
  Message = io_lib:format(MessageString, [[H], Column, H]),
  ErrorColumn = Column - 1 - length(SigilName),
  error({Line, ErrorColumn, "invalid sigil delimiter: ", Message}, [$~] ++ SigilName ++ Original, Scope, Tokens);

% Incomplete sigil.
tokenize_sigil_contents([], _SigilName, Line, Column, Scope, Tokens) ->
  tokenize([], Line, Column, Scope, Tokens).

%% Fail early on invalid do syntax. For example, after
%% most keywords, after comma and so on.
tokenize_keyword_terminator(DoLine, DoColumn, do, [{identifier, {Line, Column, Meta}, Atom} | T]) ->
  {ok, add_token_with_eol({do, {DoLine, DoColumn, nil}},
                          [{do_identifier, {Line, Column, Meta}, Atom} | T])};
tokenize_keyword_terminator(_Line, _Column, do, [{'fn', _} | _]) ->
  {error, invalid_do_with_fn_error("unexpected reserved word: "), "do"};
tokenize_keyword_terminator(Line, Column, do, Tokens) ->
  case is_valid_do(Tokens) of
    true  -> {ok, add_token_with_eol({do, {Line, Column, nil}}, Tokens)};
    false -> {error, invalid_do_error("unexpected reserved word: "), "do"}
  end;
tokenize_keyword_terminator(Line, Column, Atom, Tokens) ->
  {ok, [{Atom, {Line, Column, nil}} | Tokens]}.

is_valid_do([{Atom, _} | _]) ->
  case Atom of
    ','      -> false;
    ';'      -> false;
    'not'    -> false;
    'and'    -> false;
    'or'     -> false;
    'when'   -> false;
    'in'     -> false;
    'after'  -> false;
    'else'   -> false;
    'catch'  -> false;
    'rescue' -> false;
    _        -> true
  end;
is_valid_do(_) ->
  true.

invalid_character_error(What, Char) ->
  io_lib:format("invalid character \"~ts\" (code point U+~4.16.0B) in ~ts: ", [[Char], Char, What]).

invalid_do_error(Prefix) ->
  {Prefix, ". In case you wanted to write a \"do\" expression, "
  "you must either use do-blocks or separate the keyword argument with comma. "
  "For example, you should either write:\n\n"
  "    if some_condition? do\n"
  "      :this\n"
  "    else\n"
  "      :that\n"
  "    end\n\n"
  "or the equivalent construct:\n\n"
  "    if(some_condition?, do: :this, else: :that)\n\n"
  "where \"some_condition?\" is the first argument and the second argument is a keyword list.\n\n"
  "You may see this error if you forget a trailing comma before the \"do\" in a \"do\" block"}.

invalid_do_with_fn_error(Prefix) ->
  {Prefix, ". Anonymous functions are written as:\n\n"
  "    fn pattern -> expression end\n\nPlease remove the \"do\" keyword"}.

% TODO: Turn into an error on v2.0
maybe_warn_too_many_of_same_char([T | _] = Token, [T | _] = _Rest, Line, Column, Scope) ->
  Warning =
    case T of
      $. -> "please use parens around \"...\" instead";
      _ -> io_lib:format("please use a space between \"~ts\" and the next \"~ts\"", [Token, [T]])
    end,
  Message = io_lib:format("found \"~ts\" followed by \"~ts\", ~ts", [Token, [T], Warning]),
  prepend_warning(Line, Column, Message, Scope);
maybe_warn_too_many_of_same_char(_Token, _Rest, _Line, _Column, Scope) ->
  Scope.

%% TODO: Turn into an error on v2.0
maybe_warn_for_ambiguous_bang_before_equals(Kind, Unencoded, [$= | _], Line, Column, Scope) ->
  {What, Identifier} =
    case Kind of
      atom -> {"atom", [$: | Unencoded]};
      identifier -> {"identifier", Unencoded}
    end,

  case lists:last(Identifier) of
    Last when Last =:= $!; Last =:= $? ->
      Msg = io_lib:format("found ~ts \"~ts\", ending with \"~ts\", followed by =. "
                          "It is unclear if you mean \"~ts ~ts=\" or \"~ts =\". Please add "
                          "a space before or after ~ts to remove the ambiguity",
                          [What, Identifier, [Last], lists:droplast(Identifier), [Last], Identifier, [Last]]),
      prepend_warning(Line, Column, Msg, Scope);
    _ ->
      Scope
  end;
maybe_warn_for_ambiguous_bang_before_equals(_Kind, _Atom, _Rest, _Line, _Column, Scope) ->
  Scope.

prepend_warning(Line, Column, Msg, #elixir_tokenizer{warnings=Warnings} = Scope) ->
  Scope#elixir_tokenizer{warnings = [{{Line, Column}, Msg} | Warnings]}.

track_ascii(true, Scope) -> Scope;
track_ascii(false, Scope) -> Scope#elixir_tokenizer{ascii_identifiers_only=false}.

maybe_unicode_lint_warnings(_Ascii=false, Tokens, Warnings) ->
  'Elixir.String.Tokenizer.Security':unicode_lint_warnings(lists:reverse(Tokens)) ++ Warnings;
maybe_unicode_lint_warnings(_Ascii=true, _Tokens, Warnings) ->
  Warnings.

error(Reason, Rest, #elixir_tokenizer{warnings=Warnings}, Tokens) ->
  {error, Reason, Rest, Warnings, Tokens}.

%% Cursor handling

cursor_complete(Line, Column, Terminators, Tokens) ->
  {AccTokens, _} =
    lists:foldl(
      fun({Start, _, _}, {NewTokens, NewColumn}) ->
        End = terminator(Start),
        AccTokens = [{End, {Line, NewColumn, nil}} | NewTokens],
        AccColumn = NewColumn + length(erlang:atom_to_list(End)),
        {AccTokens, AccColumn}
      end,
      {Tokens, Column},
      Terminators
    ),
  lists:reverse(AccTokens).

add_cursor(_Line, Column, noprune, Terminators, Tokens) ->
  {Column, Terminators, Tokens};
add_cursor(Line, Column, prune_and_cursor, Terminators, Tokens) ->
  {PrunedTokens, PrunedTerminators} = prune_tokens(Tokens, [], Terminators),
  CursorTokens = [
    {')', {Line, Column + 11, nil}},
    {'(', {Line, Column + 10, nil}},
    {paren_identifier, {Line, Column, nil}, '__cursor__'}
    | PrunedTokens
  ],
  {Column + 12, PrunedTerminators, CursorTokens}.

%%% Any terminator needs to be closed
prune_tokens([{'end', _} | Tokens], Opener, Terminators) ->
  prune_tokens(Tokens, ['end' | Opener], Terminators);
prune_tokens([{')', _} | Tokens], Opener, Terminators) ->
  prune_tokens(Tokens, [')' | Opener], Terminators);
prune_tokens([{']', _} | Tokens], Opener, Terminators) ->
  prune_tokens(Tokens, [']' | Opener], Terminators);
prune_tokens([{'}', _} | Tokens], Opener, Terminators) ->
  prune_tokens(Tokens, ['}' | Opener], Terminators);
prune_tokens([{'>>', _} | Tokens], Opener, Terminators) ->
  prune_tokens(Tokens, ['>>' | Opener], Terminators);
%%% Close opened terminators
prune_tokens([{'fn', _} | Tokens], ['end' | Opener], Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
prune_tokens([{'do', _} | Tokens], ['end' | Opener], Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
prune_tokens([{'(', _} | Tokens], [')' | Opener], Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
prune_tokens([{'[', _} | Tokens], [']' | Opener], Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
prune_tokens([{'{', _} | Tokens], ['}' | Opener], Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
prune_tokens([{'<<', _} | Tokens], ['>>' | Opener], Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
%%% Handle anonymous functions
prune_tokens(Tokens, [], [{'fn', _, _} | Terminators]) ->
  prune_tokens(drop_including(Tokens, 'fn'), [], Terminators);
prune_tokens([{'(', _}, {capture_op, _, _} | Tokens], [], [{'(', _, _} | Terminators]) ->
  prune_tokens(Tokens, [], Terminators);
%%% or it is time to stop...
prune_tokens([{',', _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{'do', _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{'(', _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{'[', _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{'{', _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{'<<', _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{block_identifier, _, _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{kw_identifier, _, _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{kw_identifier_safe, _, _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
prune_tokens([{kw_identifier_unsafe, _, _} | _] = Tokens, [], Terminators) ->
  {Tokens, Terminators};
%%% we usually skip operators, except these contextual ones
prune_tokens([{type_op, _, '::'} | _] = Tokens, [], [{'<<', _, _} | _] = Terminators) ->
  {Tokens, Terminators};
prune_tokens([{pipe_op, _, '|'} | _] = Tokens, [], [{'{', _, _} | _] = Terminators) ->
  {Tokens, Terminators};
%%% or we traverse until the end.
prune_tokens([_ | Tokens], Opener, Terminators) ->
  prune_tokens(Tokens, Opener, Terminators);
prune_tokens([], [], Terminators) ->
  {[], Terminators}.

drop_including([{Token, _} | Tokens], Token) -> Tokens;
drop_including([_ | Tokens], Token) -> drop_including(Tokens, Token);
drop_including([], _Token) -> [].