Diffstat (limited to 'Lib/tokenize.py')
 Lib/tokenize.py | 272 ++++++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 201 insertions(+), 71 deletions(-)
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index 29c9e29b30..5304de5941 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -25,13 +25,16 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
'Skip Montanaro, Raymond Hettinger, Trent Nelson, '
'Michael Foord')
import builtins
-import re
-import sys
-from token import *
from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
-cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
+from itertools import chain
+import re
+import sys
+from token import *
+
+cookie_re = re.compile(r'^[ \t\f]*#.*coding[:=][ \t]*([-\w.]+)', re.ASCII)
+blank_re = re.compile(br'^[ \t\f]*(?:[#\r\n]|$)', re.ASCII)
import token
__all__ = token.__all__ + ["COMMENT", "tokenize", "detect_encoding",
@@ -45,6 +48,51 @@ tok_name[NL] = 'NL'
ENCODING = N_TOKENS + 2
tok_name[ENCODING] = 'ENCODING'
N_TOKENS += 3
+EXACT_TOKEN_TYPES = {
+ '(': LPAR,
+ ')': RPAR,
+ '[': LSQB,
+ ']': RSQB,
+ ':': COLON,
+ ',': COMMA,
+ ';': SEMI,
+ '+': PLUS,
+ '-': MINUS,
+ '*': STAR,
+ '/': SLASH,
+ '|': VBAR,
+ '&': AMPER,
+ '<': LESS,
+ '>': GREATER,
+ '=': EQUAL,
+ '.': DOT,
+ '%': PERCENT,
+ '{': LBRACE,
+ '}': RBRACE,
+ '==': EQEQUAL,
+ '!=': NOTEQUAL,
+ '<=': LESSEQUAL,
+ '>=': GREATEREQUAL,
+ '~': TILDE,
+ '^': CIRCUMFLEX,
+ '<<': LEFTSHIFT,
+ '>>': RIGHTSHIFT,
+ '**': DOUBLESTAR,
+ '+=': PLUSEQUAL,
+ '-=': MINEQUAL,
+ '*=': STAREQUAL,
+ '/=': SLASHEQUAL,
+ '%=': PERCENTEQUAL,
+ '&=': AMPEREQUAL,
+ '|=': VBAREQUAL,
+ '^=': CIRCUMFLEXEQUAL,
+ '<<=': LEFTSHIFTEQUAL,
+ '>>=': RIGHTSHIFTEQUAL,
+ '**=': DOUBLESTAREQUAL,
+ '//': DOUBLESLASH,
+ '//=': DOUBLESLASHEQUAL,
+ '@': AT
+}
class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line')):
def __repr__(self):
@@ -52,6 +100,13 @@ class TokenInfo(collections.namedtuple('TokenInfo', 'type string start end line'
return ('TokenInfo(type=%s, string=%r, start=%r, end=%r, line=%r)' %
self._replace(type=annotated_type))
+ @property
+ def exact_type(self):
+ if self.type == OP and self.string in EXACT_TOKEN_TYPES:
+ return EXACT_TOKEN_TYPES[self.string]
+ else:
+ return self.type
+
def group(*choices): return '(' + '|'.join(choices) + ')'
def any(*choices): return group(*choices) + '*'
def maybe(*choices): return group(*choices) + '?'
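
Illustrative sketch (not part of the patch): the new exact_type property consults EXACT_TOKEN_TYPES so callers can distinguish individual operators without string comparisons. The sample source line is invented:

    import io
    from tokenize import tokenize, tok_name

    # readline must yield bytes; an ENCODING token is emitted first.
    for tok in tokenize(io.BytesIO(b"x += 1\n").readline):
        # tok.type stays OP for operators; tok.exact_type maps '+=' to
        # PLUSEQUAL via the EXACT_TOKEN_TYPES table added above.
        print(tok_name[tok.type], tok_name[tok.exact_type], repr(tok.string))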
@@ -75,6 +130,8 @@ Floatnumber = group(Pointfloat, Expfloat)
Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
+StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
# Tail end of " string.
@@ -83,10 +140,10 @@ Double = r'[^"\\]*(?:\\.[^"\\]*)*"'
Single3 = r"[^'\\]*(?:(?:\\.|'(?!''))[^'\\]*)*'''"
# Tail end of """ string.
Double3 = r'[^"\\]*(?:(?:\\.|"(?!""))[^"\\]*)*"""'
-Triple = group("[bB]?[rR]?'''", '[bB]?[rR]?"""')
+Triple = group(StringPrefix + "'''", StringPrefix + '"""')
# Single-line ' or " string.
-String = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
- r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
+String = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*'",
+ StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*"')
# Because of leftmost-then-longest match semantics, be sure to put the
# longest operators first (e.g., if = came before ==, == would get
@@ -104,9 +161,9 @@ PlainToken = group(Number, Funny, String, Name)
Token = Ignore + PlainToken
# First (or only) line of ' or " string.
-ContStr = group(r"[bB]?[rR]?'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
+ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
group("'", r'\\\r?\n'),
- r'[bB]?[rR]?"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
+ StringPrefix + r'"[^\n"\\]*(?:\\.[^\n"\\]*)*' +
group('"', r'\\\r?\n'))
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
@@ -114,37 +171,49 @@ PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
def _compile(expr):
return re.compile(expr, re.UNICODE)
-tokenprog, pseudoprog, single3prog, double3prog = map(
- _compile, (Token, PseudoToken, Single3, Double3))
-endprogs = {"'": _compile(Single), '"': _compile(Double),
- "'''": single3prog, '"""': double3prog,
- "r'''": single3prog, 'r"""': double3prog,
- "b'''": single3prog, 'b"""': double3prog,
- "br'''": single3prog, 'br"""': double3prog,
- "R'''": single3prog, 'R"""': double3prog,
- "B'''": single3prog, 'B"""': double3prog,
- "bR'''": single3prog, 'bR"""': double3prog,
- "Br'''": single3prog, 'Br"""': double3prog,
- "BR'''": single3prog, 'BR"""': double3prog,
- 'r': None, 'R': None, 'b': None, 'B': None}
+endpats = {"'": Single, '"': Double,
+ "'''": Single3, '"""': Double3,
+ "r'''": Single3, 'r"""': Double3,
+ "b'''": Single3, 'b"""': Double3,
+ "R'''": Single3, 'R"""': Double3,
+ "B'''": Single3, 'B"""': Double3,
+ "br'''": Single3, 'br"""': Double3,
+ "bR'''": Single3, 'bR"""': Double3,
+ "Br'''": Single3, 'Br"""': Double3,
+ "BR'''": Single3, 'BR"""': Double3,
+ "rb'''": Single3, 'rb"""': Double3,
+ "Rb'''": Single3, 'Rb"""': Double3,
+ "rB'''": Single3, 'rB"""': Double3,
+ "RB'''": Single3, 'RB"""': Double3,
+ "u'''": Single3, 'u"""': Double3,
+ "R'''": Single3, 'R"""': Double3,
+ "U'''": Single3, 'U"""': Double3,
+ 'r': None, 'R': None, 'b': None, 'B': None,
+ 'u': None, 'U': None}
triple_quoted = {}
for t in ("'''", '"""',
"r'''", 'r"""', "R'''", 'R"""',
"b'''", 'b"""', "B'''", 'B"""',
"br'''", 'br"""', "Br'''", 'Br"""',
- "bR'''", 'bR"""', "BR'''", 'BR"""'):
+ "bR'''", 'bR"""', "BR'''", 'BR"""',
+ "rb'''", 'rb"""', "rB'''", 'rB"""',
+ "Rb'''", 'Rb"""', "RB'''", 'RB"""',
+ "u'''", 'u"""', "U'''", 'U"""',
+ ):
triple_quoted[t] = t
single_quoted = {}
for t in ("'", '"',
"r'", 'r"', "R'", 'R"',
"b'", 'b"', "B'", 'B"',
"br'", 'br"', "Br'", 'Br"',
- "bR'", 'bR"', "BR'", 'BR"' ):
+ "bR'", 'bR"', "BR'", 'BR"' ,
+ "rb'", 'rb"', "rB'", 'rB"',
+ "Rb'", 'Rb"', "RB'", 'RB"' ,
+ "u'", 'u"', "U'", 'U"',
+ ):
single_quoted[t] = t
-del _compile
-
tabsize = 8
class TokenError(Exception): pass
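
Illustrative sketch (not part of the patch): with the u/U and rb-style prefixes registered in endpats, triple_quoted, and single_quoted, a PEP 414 literal now tokenizes as a single STRING token:

    import io
    from tokenize import tokenize, STRING

    toks = list(tokenize(io.BytesIO(b"s = u'hi'\n").readline))
    # The prefix is kept as part of the one STRING token.
    assert any(t.type == STRING and t.string == "u'hi'" for t in toks)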
@@ -162,20 +231,29 @@ class Untokenizer:
def add_whitespace(self, start):
row, col = start
- assert row <= self.prev_row
+ if row < self.prev_row or row == self.prev_row and col < self.prev_col:
+ raise ValueError("start ({},{}) precedes previous end ({},{})"
+ .format(row, col, self.prev_row, self.prev_col))
+ row_offset = row - self.prev_row
+ if row_offset:
+ self.tokens.append("\\\n" * row_offset)
+ self.prev_col = 0
col_offset = col - self.prev_col
if col_offset:
self.tokens.append(" " * col_offset)
def untokenize(self, iterable):
- for t in iterable:
+ it = iter(iterable)
+ for t in it:
if len(t) == 2:
- self.compat(t, iterable)
+ self.compat(t, it)
break
tok_type, token, start, end, line = t
if tok_type == ENCODING:
self.encoding = token
continue
+ if tok_type == ENDMARKER:
+ break
self.add_whitespace(start)
self.tokens.append(token)
self.prev_row, self.prev_col = end
@@ -185,17 +263,12 @@ class Untokenizer:
return "".join(self.tokens)
def compat(self, token, iterable):
- startline = False
indents = []
toks_append = self.tokens.append
- toknum, tokval = token
-
- if toknum in (NAME, NUMBER):
- tokval += ' '
- if toknum in (NEWLINE, NL):
- startline = True
+ startline = token[0] in (NEWLINE, NL)
prevstring = False
- for tok in iterable:
+
+ for tok in chain([token], iterable):
toknum, tokval = tok[:2]
if toknum == ENCODING:
self.encoding = tokval
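
Illustrative sketch (not part of the patch): with full five-tuples, the reworked Untokenizer rejects out-of-order positions with a ValueError instead of a bare assert, bridges row gaps with backslash-newlines, and stops at ENDMARKER, so a simple round trip should reproduce the source:

    import io
    from tokenize import tokenize, untokenize

    source = b"if x:\n    y = 1\n"
    tokens = list(tokenize(io.BytesIO(source).readline))
    # The leading ENCODING token makes untokenize() return bytes.
    assert untokenize(tokens) == source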
@@ -267,7 +340,7 @@ def _get_normal_name(orig_enc):
def detect_encoding(readline):
"""
The detect_encoding() function is used to detect the encoding that should
- be used to decode a Python source file. It requires one argment, readline,
+ be used to decode a Python source file. It requires one argument, readline,
in the same way as the tokenize() generator.
It will call readline a maximum of twice, and return the encoding used
@@ -281,6 +354,10 @@ def detect_encoding(readline):
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
+ try:
+ filename = readline.__self__.name
+ except AttributeError:
+ filename = None
bom_found = False
encoding = None
default = 'utf-8'
@@ -297,22 +374,34 @@ def detect_encoding(readline):
# per default encoding.
line_string = line.decode('utf-8')
except UnicodeDecodeError:
- raise SyntaxError("invalid or missing encoding declaration")
+ msg = "invalid or missing encoding declaration"
+ if filename is not None:
+ msg = '{} for {!r}'.format(msg, filename)
+ raise SyntaxError(msg)
- matches = cookie_re.findall(line_string)
- if not matches:
+ match = cookie_re.match(line_string)
+ if not match:
return None
- encoding = _get_normal_name(matches[0])
+ encoding = _get_normal_name(match.group(1))
try:
codec = lookup(encoding)
except LookupError:
# This behaviour mimics the Python interpreter
- raise SyntaxError("unknown encoding: " + encoding)
+ if filename is None:
+ msg = "unknown encoding: " + encoding
+ else:
+ msg = "unknown encoding for {!r}: {}".format(filename,
+ encoding)
+ raise SyntaxError(msg)
if bom_found:
if encoding != 'utf-8':
# This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
+ if filename is None:
+ msg = 'encoding problem: utf-8'
+ else:
+ msg = 'encoding problem for {!r}: utf-8'.format(filename)
+ raise SyntaxError(msg)
encoding += '-sig'
return encoding
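
Illustrative sketch (not part of the patch): detect_encoding() reads at most two lines, normalizes the cookie, and returns the lines it consumed; after this patch the SyntaxErrors it raises also name the file when readline is a bound method of a named stream. A sketch with an invented cookie:

    import io
    from tokenize import detect_encoding

    buf = io.BytesIO(b"# -*- coding: latin-1 -*-\nx = 1\n")
    encoding, consumed = detect_encoding(buf.readline)
    print(encoding)  # 'iso-8859-1' (latin-1 is normalized)
    print(consumed)  # [b'# -*- coding: latin-1 -*-\n']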
@@ -327,6 +416,8 @@ def detect_encoding(readline):
encoding = find_cookie(first)
if encoding:
return encoding, [first]
+ if not blank_re.match(first):
+ return default, [first]
second = read_or_stop()
if not second:
@@ -469,7 +560,7 @@ def _tokenize(readline, encoding):
continued = 0
while pos < max:
- pseudomatch = pseudoprog.match(line, pos)
+ pseudomatch = _compile(PseudoToken).match(line, pos)
if pseudomatch: # scan for tokens
start, end = pseudomatch.span(1)
spos, epos, pos = (lnum, start), (lnum, end), end
@@ -487,7 +578,7 @@ def _tokenize(readline, encoding):
assert not token.endswith("\n")
yield TokenInfo(COMMENT, token, spos, epos, line)
elif token in triple_quoted:
- endprog = endprogs[token]
+ endprog = _compile(endpats[token])
endmatch = endprog.match(line, pos)
if endmatch: # all on one line
pos = endmatch.end(0)
@@ -503,8 +594,9 @@ def _tokenize(readline, encoding):
token[:3] in single_quoted:
if token[-1] == '\n': # continued string
strstart = (lnum, start)
- endprog = (endprogs[initial] or endprogs[token[1]] or
- endprogs[token[2]])
+ endprog = _compile(endpats[initial] or
+ endpats[token[1]] or
+ endpats[token[2]])
contstr, needcont = line[start:], 1
contline = line
break
@@ -535,27 +627,65 @@ def _tokenize(readline, encoding):
def generate_tokens(readline):
return _tokenize(readline, None)
+def main():
+ import argparse
+
+ # Helper error handling routines
+ def perror(message):
+ print(message, file=sys.stderr)
+
+ def error(message, filename=None, location=None):
+ if location:
+ args = (filename,) + location + (message,)
+ perror("%s:%d:%d: error: %s" % args)
+ elif filename:
+ perror("%s: error: %s" % (filename, message))
+ else:
+ perror("error: %s" % message)
+ sys.exit(1)
+
+ # Parse the arguments and options
+ parser = argparse.ArgumentParser(prog='python -m tokenize')
+ parser.add_argument(dest='filename', nargs='?',
+ metavar='filename.py',
+ help='the file to tokenize; defaults to stdin')
+ parser.add_argument('-e', '--exact', dest='exact', action='store_true',
+ help='display token names using the exact type')
+ args = parser.parse_args()
+
+ try:
+ # Tokenize the input
+ if args.filename:
+ filename = args.filename
+ with builtins.open(filename, 'rb') as f:
+ tokens = list(tokenize(f.readline))
+ else:
+ filename = "<stdin>"
+ tokens = _tokenize(sys.stdin.readline, None)
+
+ # Output the tokenization
+ for token in tokens:
+ token_type = token.type
+ if args.exact:
+ token_type = token.exact_type
+ token_range = "%d,%d-%d,%d:" % (token.start + token.end)
+ print("%-20s%-15s%-15r" %
+ (token_range, tok_name[token_type], token.string))
+ except IndentationError as err:
+ line, column = err.args[1][1:3]
+ error(err.args[0], filename, (line, column))
+ except TokenError as err:
+ line, column = err.args[1]
+ error(err.args[0], filename, (line, column))
+ except SyntaxError as err:
+ error(err, filename)
+ except IOError as err:
+ error(err)
+ except KeyboardInterrupt:
+ print("interrupted\n")
+ except Exception as err:
+ perror("unexpected error: %s" % err)
+ raise
+
if __name__ == "__main__":
- # Quick sanity check
- s = b'''def parseline(self, line):
- """Parse the line into a command name and a string containing
- the arguments. Returns a tuple containing (command, args, line).
- 'command' and 'args' may be None if the line couldn't be parsed.
- """
- line = line.strip()
- if not line:
- return None, None, line
- elif line[0] == '?':
- line = 'help ' + line[1:]
- elif line[0] == '!':
- if hasattr(self, 'do_shell'):
- line = 'shell ' + line[1:]
- else:
- return None, None, line
- i, n = 0, len(line)
- while i < n and line[i] in self.identchars: i = i+1
- cmd, arg = line[:i], line[i:].strip()
- return cmd, arg, line
- '''
- for tok in tokenize(iter(s.splitlines()).__next__):
- print(tok)
+ main()
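
Illustrative session (not part of the patch): main() makes the module runnable as a command-line tokenizer; with -e/--exact the operator names come from exact_type. The output below follows the '%-20s%-15s%-15r' format string above, with trailing padding trimmed:

    $ echo 'x += 1' | python -m tokenize -e
    1,0-1,1:            NAME           'x'
    1,2-1,4:            PLUSEQUAL      '+='
    1,5-1,6:            NUMBER         '1'
    1,6-1,7:            NEWLINE        '\n'
    2,0-2,0:            ENDMARKER      ''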