summaryrefslogtreecommitdiff
path: root/Lib/sre_parse.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r--Lib/sre_parse.py102
1 files changed, 73 insertions, 29 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 045a5ebfef..8a77790b08 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -15,7 +15,6 @@
import sys
from sre_constants import *
-from _sre import MAXREPEAT
SPECIAL_CHARS = ".\\[{()*+?^$|"
REPEAT_CHARS = "*+?{"
@@ -148,7 +147,7 @@ class SubPattern:
REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
for op, av in self.data:
if op is BRANCH:
- i = sys.maxsize
+ i = MAXREPEAT - 1
j = 0
for av in av[1]:
l, h = av.getwidth()
@@ -166,18 +165,19 @@ class SubPattern:
hi = hi + j
elif op in REPEATCODES:
i, j = av[2].getwidth()
- lo = lo + int(i) * av[0]
- hi = hi + int(j) * av[1]
+ lo = lo + i * av[0]
+ hi = hi + j * av[1]
elif op in UNITCODES:
lo = lo + 1
hi = hi + 1
elif op == SUCCESS:
break
- self.width = int(min(lo, sys.maxsize)), int(min(hi, sys.maxsize))
+ self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
return self.width
class Tokenizer:
def __init__(self, string):
+ self.istext = isinstance(string, str)
self.string = string
self.index = 0
self.__next()
@@ -188,14 +188,14 @@ class Tokenizer:
char = self.string[self.index:self.index+1]
# Special case for the str8, since indexing returns a integer
# XXX This is only needed for test_bug_926075 in test_re.py
- if char and isinstance(char, bytes):
+ if char and not self.istext:
char = chr(char[0])
if char == "\\":
try:
c = self.string[self.index + 1]
except IndexError:
raise error("bogus escape (end of line)")
- if isinstance(self.string, bytes):
+ if not self.istext:
c = chr(c)
char = char + c
self.index = self.index + len(char)
@@ -210,18 +210,39 @@ class Tokenizer:
this = self.next
self.__next()
return this
+ def getwhile(self, n, charset):
+ result = ''
+ for _ in range(n):
+ c = self.next
+ if c not in charset:
+ break
+ result += c
+ self.__next()
+ return result
def tell(self):
return self.index, self.next
def seek(self, index):
self.index, self.next = index
+# The following three functions are not used in this module anymore, but we keep
+# them here (with DeprecationWarnings) for backwards compatibility.
+
def isident(char):
+ import warnings
+ warnings.warn('sre_parse.isident() will be removed in 3.5',
+ DeprecationWarning, stacklevel=2)
return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
def isdigit(char):
+ import warnings
+ warnings.warn('sre_parse.isdigit() will be removed in 3.5',
+ DeprecationWarning, stacklevel=2)
return "0" <= char <= "9"
def isname(name):
+ import warnings
+ warnings.warn('sre_parse.isname() will be removed in 3.5',
+ DeprecationWarning, stacklevel=2)
# check that group name is a valid string
if not isident(name[0]):
return False
@@ -242,20 +263,30 @@ def _class_escape(source, escape):
c = escape[1:2]
if c == "x":
# hexadecimal escape (exactly two digits)
- while source.next in HEXDIGITS and len(escape) < 4:
- escape = escape + source.get()
- escape = escape[2:]
- if len(escape) != 2:
- raise error("bogus escape: %s" % repr("\\" + escape))
- return LITERAL, int(escape, 16) & 0xff
+ escape += source.getwhile(2, HEXDIGITS)
+ if len(escape) != 4:
+ raise ValueError
+ return LITERAL, int(escape[2:], 16) & 0xff
+ elif c == "u" and source.istext:
+ # unicode escape (exactly four digits)
+ escape += source.getwhile(4, HEXDIGITS)
+ if len(escape) != 6:
+ raise ValueError
+ return LITERAL, int(escape[2:], 16)
+ elif c == "U" and source.istext:
+ # unicode escape (exactly eight digits)
+ escape += source.getwhile(8, HEXDIGITS)
+ if len(escape) != 10:
+ raise ValueError
+ c = int(escape[2:], 16)
+ chr(c) # raise ValueError for invalid code
+ return LITERAL, c
elif c in OCTDIGITS:
# octal escape (up to three digits)
- while source.next in OCTDIGITS and len(escape) < 4:
- escape = escape + source.get()
- escape = escape[1:]
- return LITERAL, int(escape, 8) & 0xff
+ escape += source.getwhile(2, OCTDIGITS)
+ return LITERAL, int(escape[1:], 8) & 0xff
elif c in DIGITS:
- raise error("bogus escape: %s" % repr(escape))
+ raise ValueError
if len(escape) == 2:
return LITERAL, ord(escape[1])
except ValueError:
@@ -274,15 +305,27 @@ def _escape(source, escape, state):
c = escape[1:2]
if c == "x":
# hexadecimal escape
- while source.next in HEXDIGITS and len(escape) < 4:
- escape = escape + source.get()
+ escape += source.getwhile(2, HEXDIGITS)
if len(escape) != 4:
raise ValueError
return LITERAL, int(escape[2:], 16) & 0xff
+ elif c == "u" and source.istext:
+ # unicode escape (exactly four digits)
+ escape += source.getwhile(4, HEXDIGITS)
+ if len(escape) != 6:
+ raise ValueError
+ return LITERAL, int(escape[2:], 16)
+ elif c == "U" and source.istext:
+ # unicode escape (exactly eight digits)
+ escape += source.getwhile(8, HEXDIGITS)
+ if len(escape) != 10:
+ raise ValueError
+ c = int(escape[2:], 16)
+ chr(c) # raise ValueError for invalid code
+ return LITERAL, c
elif c == "0":
# octal escape
- while source.next in OCTDIGITS and len(escape) < 4:
- escape = escape + source.get()
+ escape += source.getwhile(2, OCTDIGITS)
return LITERAL, int(escape[1:], 8) & 0xff
elif c in DIGITS:
# octal escape *or* decimal group reference (sigh)
@@ -555,8 +598,8 @@ def _parse(source, state):
group = 1
if not name:
raise error("missing group name")
- if not isname(name):
- raise error("bad character in group name")
+ if not name.isidentifier():
+ raise error("bad character in group name %r" % name)
elif sourcematch("="):
# named backreference
name = ""
@@ -569,8 +612,9 @@ def _parse(source, state):
name = name + char
if not name:
raise error("missing group name")
- if not isname(name):
- raise error("bad character in group name")
+ if not name.isidentifier():
+ raise error("bad character in backref group name "
+ "%r" % name)
gid = state.groupdict.get(name)
if gid is None:
raise error("unknown group name")
@@ -623,7 +667,7 @@ def _parse(source, state):
group = 2
if not condname:
raise error("missing group name")
- if isname(condname):
+ if condname.isidentifier():
condgroup = state.groupdict.get(condname)
if condgroup is None:
raise error("unknown group name")
@@ -760,7 +804,7 @@ def parse_template(source, pattern):
if index < 0:
raise error("negative group number")
except ValueError:
- if not isname(name):
+ if not name.isidentifier():
raise error("bad character in group name")
try:
index = pattern.groupindex[name]
@@ -802,7 +846,7 @@ def parse_template(source, pattern):
else:
# The tokenizer implicitly decodes bytes objects as latin-1, we must
# therefore re-encode the final representation.
- encode = lambda x: x.encode('latin1')
+ encode = lambda x: x.encode('latin-1')
for c, s in p:
if c is MARK:
groupsappend((i, s))