1 files changed, 73 insertions, 29 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py
index 045a5ebfef..8a77790b08 100644
--- a/Lib/sre_parse.py
+++ b/Lib/sre_parse.py
@@ -15,7 +15,6 @@
 import sys
 
 from sre_constants import *
-from _sre import MAXREPEAT
 
 SPECIAL_CHARS = ".\\[{()*+?^$|"
 REPEAT_CHARS = "*+?{"
@@ -148,7 +147,7 @@ class SubPattern:
         REPEATCODES = (MIN_REPEAT, MAX_REPEAT)
         for op, av in self.data:
             if op is BRANCH:
-                i = sys.maxsize
+                i = MAXREPEAT - 1
                 j = 0
                 for av in av[1]:
                     l, h = av.getwidth()
@@ -166,18 +165,19 @@ class SubPattern:
                 hi = hi + j
             elif op in REPEATCODES:
                 i, j = av[2].getwidth()
-                lo = lo + int(i) * av[0]
-                hi = hi + int(j) * av[1]
+                lo = lo + i * av[0]
+                hi = hi + j * av[1]
             elif op in UNITCODES:
                 lo = lo + 1
                 hi = hi + 1
             elif op == SUCCESS:
                 break
-        self.width = int(min(lo, sys.maxsize)), int(min(hi, sys.maxsize))
+        self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT)
         return self.width
 
 class Tokenizer:
     def __init__(self, string):
+        self.istext = isinstance(string, str)
         self.string = string
         self.index = 0
         self.__next()
@@ -188,14 +188,14 @@ class Tokenizer:
         char = self.string[self.index:self.index+1]
         # Special case for the str8, since indexing returns a integer
         # XXX This is only needed for test_bug_926075 in test_re.py
-        if char and isinstance(char, bytes):
+        if char and not self.istext:
             char = chr(char[0])
         if char == "\\":
             try:
                 c = self.string[self.index + 1]
             except IndexError:
                 raise error("bogus escape (end of line)")
-            if isinstance(self.string, bytes):
+            if not self.istext:
                 c = chr(c)
             char = char + c
         self.index = self.index + len(char)
@@ -210,18 +210,39 @@ class Tokenizer:
         this = self.next
         self.__next()
         return this
+    def getwhile(self, n, charset):
+        result = ''
+        for _ in range(n):
+            c = self.next
+            if c not in charset:
+                break
+            result += c
+            self.__next()
+        return result
     def tell(self):
         return self.index, self.next
     def seek(self, index):
         self.index, self.next = index
 
+# The following three functions are not used in this module anymore, but we keep
+# them here (with DeprecationWarnings) for backwards compatibility.
+
 def isident(char):
+    import warnings
+    warnings.warn('sre_parse.isident() will be removed in 3.5',
+                  DeprecationWarning, stacklevel=2)
     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"
 
 def isdigit(char):
+    import warnings
+    warnings.warn('sre_parse.isdigit() will be removed in 3.5',
+                  DeprecationWarning, stacklevel=2)
     return "0" <= char <= "9"
 
 def isname(name):
+    import warnings
+    warnings.warn('sre_parse.isname() will be removed in 3.5',
+                  DeprecationWarning, stacklevel=2)
     # check that group name is a valid string
     if not isident(name[0]):
         return False
@@ -242,20 +263,30 @@ def _class_escape(source, escape):
         c = escape[1:2]
         if c == "x":
             # hexadecimal escape (exactly two digits)
-            while source.next in HEXDIGITS and len(escape) < 4:
-                escape = escape + source.get()
-            escape = escape[2:]
-            if len(escape) != 2:
-                raise error("bogus escape: %s" % repr("\\" + escape))
-            return LITERAL, int(escape, 16) & 0xff
+            escape += source.getwhile(2, HEXDIGITS)
+            if len(escape) != 4:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16) & 0xff
+        elif c == "u" and source.istext:
+            # unicode escape (exactly four digits)
+            escape += source.getwhile(4, HEXDIGITS)
+            if len(escape) != 6:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U" and source.istext:
+            # unicode escape (exactly eight digits)
+            escape += source.getwhile(8, HEXDIGITS)
+            if len(escape) != 10:
+                raise ValueError
+            c = int(escape[2:], 16)
+            chr(c) # raise ValueError for invalid code
+            return LITERAL, c
         elif c in OCTDIGITS:
             # octal escape (up to three digits)
-            while source.next in OCTDIGITS and len(escape) < 4:
-                escape = escape + source.get()
-            escape = escape[1:]
-            return LITERAL, int(escape, 8) & 0xff
+            escape += source.getwhile(2, OCTDIGITS)
+            return LITERAL, int(escape[1:], 8) & 0xff
         elif c in DIGITS:
-            raise error("bogus escape: %s" % repr(escape))
+            raise ValueError
         if len(escape) == 2:
             return LITERAL, ord(escape[1])
     except ValueError:
@@ -274,15 +305,27 @@ def _escape(source, escape, state):
         c = escape[1:2]
         if c == "x":
             # hexadecimal escape
-            while source.next in HEXDIGITS and len(escape) < 4:
-                escape = escape + source.get()
+            escape += source.getwhile(2, HEXDIGITS)
             if len(escape) != 4:
                 raise ValueError
             return LITERAL, int(escape[2:], 16) & 0xff
+        elif c == "u" and source.istext:
+            # unicode escape (exactly four digits)
+            escape += source.getwhile(4, HEXDIGITS)
+            if len(escape) != 6:
+                raise ValueError
+            return LITERAL, int(escape[2:], 16)
+        elif c == "U" and source.istext:
+            # unicode escape (exactly eight digits)
+            escape += source.getwhile(8, HEXDIGITS)
+            if len(escape) != 10:
+                raise ValueError
+            c = int(escape[2:], 16)
+            chr(c) # raise ValueError for invalid code
+            return LITERAL, c
         elif c == "0":
             # octal escape
-            while source.next in OCTDIGITS and len(escape) < 4:
-                escape = escape + source.get()
+            escape += source.getwhile(2, OCTDIGITS)
             return LITERAL, int(escape[1:], 8) & 0xff
         elif c in DIGITS:
             # octal escape *or* decimal group reference (sigh)
@@ -555,8 +598,8 @@ def _parse(source, state):
                         group = 1
                         if not name:
                             raise error("missing group name")
-                        if not isname(name):
-                            raise error("bad character in group name")
+                        if not name.isidentifier():
+                            raise error("bad character in group name %r" % name)
                     elif sourcematch("="):
                         # named backreference
                         name = ""
@@ -569,8 +612,9 @@ def _parse(source, state):
                             name = name + char
                         if not name:
                             raise error("missing group name")
-                        if not isname(name):
-                            raise error("bad character in group name")
+                        if not name.isidentifier():
+                            raise error("bad character in backref group name "
+                                        "%r" % name)
                         gid = state.groupdict.get(name)
                         if gid is None:
                             raise error("unknown group name")
@@ -623,7 +667,7 @@ def _parse(source, state):
                     group = 2
                     if not condname:
                         raise error("missing group name")
-                    if isname(condname):
+                    if condname.isidentifier():
                         condgroup = state.groupdict.get(condname)
                         if condgroup is None:
                             raise error("unknown group name")
@@ -760,7 +804,7 @@ def parse_template(source, pattern):
                     if index < 0:
                         raise error("negative group number")
                 except ValueError:
-                    if not isname(name):
+                    if not name.isidentifier():
                         raise error("bad character in group name")
                     try:
                         index = pattern.groupindex[name]
@@ -802,7 +846,7 @@ def parse_template(source, pattern):
     else:
         # The tokenizer implicitly decodes bytes objects as latin-1, we must
         # therefore re-encode the final representation.
-        encode = lambda x: x.encode('latin1')
+        encode = lambda x: x.encode('latin-1')
     for c, s in p:
         if c is MARK:
             groupsappend((i, s))