summaryrefslogtreecommitdiff
path: root/Lib/tokenize.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r--Lib/tokenize.py135
1 files changed, 76 insertions, 59 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index b1d0c83263..825aa90646 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -29,6 +29,7 @@ from codecs import lookup, BOM_UTF8
import collections
from io import TextIOWrapper
from itertools import chain
+import itertools as _itertools
import re
import sys
from token import *
@@ -119,19 +120,41 @@ Comment = r'#[^\r\n]*'
Ignore = Whitespace + any(r'\\\r?\n' + Whitespace) + maybe(Comment)
Name = r'\w+'
-Hexnumber = r'0[xX][0-9a-fA-F]+'
-Binnumber = r'0[bB][01]+'
-Octnumber = r'0[oO][0-7]+'
-Decnumber = r'(?:0+|[1-9][0-9]*)'
+Hexnumber = r'0[xX](?:_?[0-9a-fA-F])+'
+Binnumber = r'0[bB](?:_?[01])+'
+Octnumber = r'0[oO](?:_?[0-7])+'
+Decnumber = r'(?:0(?:_?0)*|[1-9](?:_?[0-9])*)'
Intnumber = group(Hexnumber, Binnumber, Octnumber, Decnumber)
-Exponent = r'[eE][-+]?[0-9]+'
-Pointfloat = group(r'[0-9]+\.[0-9]*', r'\.[0-9]+') + maybe(Exponent)
-Expfloat = r'[0-9]+' + Exponent
+Exponent = r'[eE][-+]?[0-9](?:_?[0-9])*'
+Pointfloat = group(r'[0-9](?:_?[0-9])*\.(?:[0-9](?:_?[0-9])*)?',
+ r'\.[0-9](?:_?[0-9])*') + maybe(Exponent)
+Expfloat = r'[0-9](?:_?[0-9])*' + Exponent
Floatnumber = group(Pointfloat, Expfloat)
-Imagnumber = group(r'[0-9]+[jJ]', Floatnumber + r'[jJ]')
+Imagnumber = group(r'[0-9](?:_?[0-9])*[jJ]', Floatnumber + r'[jJ]')
Number = group(Imagnumber, Floatnumber, Intnumber)
-StringPrefix = r'(?:[bB][rR]?|[rR][bB]?|[uU])?'
+# Return the empty string, plus all of the valid string prefixes.
+def _all_string_prefixes():
+ # The valid string prefixes. Only contain the lower case versions,
+ # and don't contain any permuations (include 'fr', but not
+ # 'rf'). The various permutations will be generated.
+ _valid_string_prefixes = ['b', 'r', 'u', 'f', 'br', 'fr']
+ # if we add binary f-strings, add: ['fb', 'fbr']
+ result = set([''])
+ for prefix in _valid_string_prefixes:
+ for t in _itertools.permutations(prefix):
+ # create a list with upper and lower versions of each
+ # character
+ for u in _itertools.product(*[(c, c.upper()) for c in t]):
+ result.add(''.join(u))
+ return result
+
+def _compile(expr):
+ return re.compile(expr, re.UNICODE)
+
+# Note that since _all_string_prefixes includes the empty string,
+# StringPrefix can be the empty string (making it optional).
+StringPrefix = group(*_all_string_prefixes())
# Tail end of ' string.
Single = r"[^'\\]*(?:\\.[^'\\]*)*'"
@@ -169,50 +192,25 @@ ContStr = group(StringPrefix + r"'[^\n'\\]*(?:\\.[^\n'\\]*)*" +
PseudoExtras = group(r'\\\r?\n|\Z', Comment, Triple)
PseudoToken = Whitespace + group(PseudoExtras, Number, Funny, ContStr, Name)
-def _compile(expr):
- return re.compile(expr, re.UNICODE)
-
-endpats = {"'": Single, '"': Double,
- "'''": Single3, '"""': Double3,
- "r'''": Single3, 'r"""': Double3,
- "b'''": Single3, 'b"""': Double3,
- "R'''": Single3, 'R"""': Double3,
- "B'''": Single3, 'B"""': Double3,
- "br'''": Single3, 'br"""': Double3,
- "bR'''": Single3, 'bR"""': Double3,
- "Br'''": Single3, 'Br"""': Double3,
- "BR'''": Single3, 'BR"""': Double3,
- "rb'''": Single3, 'rb"""': Double3,
- "Rb'''": Single3, 'Rb"""': Double3,
- "rB'''": Single3, 'rB"""': Double3,
- "RB'''": Single3, 'RB"""': Double3,
- "u'''": Single3, 'u"""': Double3,
- "U'''": Single3, 'U"""': Double3,
- 'r': None, 'R': None, 'b': None, 'B': None,
- 'u': None, 'U': None}
-
-triple_quoted = {}
-for t in ("'''", '"""',
- "r'''", 'r"""', "R'''", 'R"""',
- "b'''", 'b"""', "B'''", 'B"""',
- "br'''", 'br"""', "Br'''", 'Br"""',
- "bR'''", 'bR"""', "BR'''", 'BR"""',
- "rb'''", 'rb"""', "rB'''", 'rB"""',
- "Rb'''", 'Rb"""', "RB'''", 'RB"""',
- "u'''", 'u"""', "U'''", 'U"""',
- ):
- triple_quoted[t] = t
-single_quoted = {}
-for t in ("'", '"',
- "r'", 'r"', "R'", 'R"',
- "b'", 'b"', "B'", 'B"',
- "br'", 'br"', "Br'", 'Br"',
- "bR'", 'bR"', "BR'", 'BR"' ,
- "rb'", 'rb"', "rB'", 'rB"',
- "Rb'", 'Rb"', "RB'", 'RB"' ,
- "u'", 'u"', "U'", 'U"',
- ):
- single_quoted[t] = t
+# For a given string prefix plus quotes, endpats maps it to a regex
+# to match the remainder of that string. _prefix can be empty, for
+# a normal single or triple quoted string (with no prefix).
+endpats = {}
+for _prefix in _all_string_prefixes():
+ endpats[_prefix + "'"] = Single
+ endpats[_prefix + '"'] = Double
+ endpats[_prefix + "'''"] = Single3
+ endpats[_prefix + '"""'] = Double3
+
+# A set of all of the single and triple quoted string prefixes,
+# including the opening quotes.
+single_quoted = set()
+triple_quoted = set()
+for t in _all_string_prefixes():
+ for u in (t + '"', t + "'"):
+ single_quoted.add(u)
+ for u in (t + '"""', t + "'''"):
+ triple_quoted.add(u)
tabsize = 8
@@ -626,6 +624,7 @@ def _tokenize(readline, encoding):
yield stashed
stashed = None
yield TokenInfo(COMMENT, token, spos, epos, line)
+
elif token in triple_quoted:
endprog = _compile(endpats[token])
endmatch = endprog.match(line, pos)
@@ -638,19 +637,37 @@ def _tokenize(readline, encoding):
contstr = line[start:]
contline = line
break
- elif initial in single_quoted or \
- token[:2] in single_quoted or \
- token[:3] in single_quoted:
+
+ # Check up to the first 3 chars of the token to see if
+ # they're in the single_quoted set. If so, they start
+ # a string.
+ # We're using the first 3, because we're looking for
+ # "rb'" (for example) at the start of the token. If
+ # we switch to longer prefixes, this needs to be
+ # adjusted.
+ # Note that initial == token[:1].
+ # Also note that single quote checking must come after
+ # triple quote checking (above).
+ elif (initial in single_quoted or
+ token[:2] in single_quoted or
+ token[:3] in single_quoted):
if token[-1] == '\n': # continued string
strstart = (lnum, start)
- endprog = _compile(endpats[initial] or
- endpats[token[1]] or
- endpats[token[2]])
+ # Again, using the first 3 chars of the
+ # token. This is looking for the matching end
+ # regex for the correct type of quote
+ # character. So it's really looking for
+ # endpats["'"] or endpats['"'], by trying to
+ # skip string prefix characters, if any.
+ endprog = _compile(endpats.get(initial) or
+ endpats.get(token[1]) or
+ endpats.get(token[2]))
contstr, needcont = line[start:], 1
contline = line
break
else: # ordinary string
yield TokenInfo(STRING, token, spos, epos, line)
+
elif initial.isidentifier(): # ordinary name
if token in ('async', 'await'):
if async_def: