diff options
Diffstat (limited to 'Lib/sre_parse.py')
-rw-r--r-- | Lib/sre_parse.py | 102 |
1 files changed, 73 insertions, 29 deletions
diff --git a/Lib/sre_parse.py b/Lib/sre_parse.py index 045a5ebfef..8a77790b08 100644 --- a/Lib/sre_parse.py +++ b/Lib/sre_parse.py @@ -15,7 +15,6 @@ import sys from sre_constants import * -from _sre import MAXREPEAT SPECIAL_CHARS = ".\\[{()*+?^$|" REPEAT_CHARS = "*+?{" @@ -148,7 +147,7 @@ class SubPattern: REPEATCODES = (MIN_REPEAT, MAX_REPEAT) for op, av in self.data: if op is BRANCH: - i = sys.maxsize + i = MAXREPEAT - 1 j = 0 for av in av[1]: l, h = av.getwidth() @@ -166,18 +165,19 @@ class SubPattern: hi = hi + j elif op in REPEATCODES: i, j = av[2].getwidth() - lo = lo + int(i) * av[0] - hi = hi + int(j) * av[1] + lo = lo + i * av[0] + hi = hi + j * av[1] elif op in UNITCODES: lo = lo + 1 hi = hi + 1 elif op == SUCCESS: break - self.width = int(min(lo, sys.maxsize)), int(min(hi, sys.maxsize)) + self.width = min(lo, MAXREPEAT - 1), min(hi, MAXREPEAT) return self.width class Tokenizer: def __init__(self, string): + self.istext = isinstance(string, str) self.string = string self.index = 0 self.__next() @@ -188,14 +188,14 @@ class Tokenizer: char = self.string[self.index:self.index+1] # Special case for the str8, since indexing returns a integer # XXX This is only needed for test_bug_926075 in test_re.py - if char and isinstance(char, bytes): + if char and not self.istext: char = chr(char[0]) if char == "\\": try: c = self.string[self.index + 1] except IndexError: raise error("bogus escape (end of line)") - if isinstance(self.string, bytes): + if not self.istext: c = chr(c) char = char + c self.index = self.index + len(char) @@ -210,18 +210,39 @@ class Tokenizer: this = self.next self.__next() return this + def getwhile(self, n, charset): + result = '' + for _ in range(n): + c = self.next + if c not in charset: + break + result += c + self.__next() + return result def tell(self): return self.index, self.next def seek(self, index): self.index, self.next = index +# The following three functions are not used in this module anymore, but we keep +# them here (with DeprecationWarnings) for backwards compatibility. + def isident(char): + import warnings + warnings.warn('sre_parse.isident() will be removed in 3.5', + DeprecationWarning, stacklevel=2) return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_" def isdigit(char): + import warnings + warnings.warn('sre_parse.isdigit() will be removed in 3.5', + DeprecationWarning, stacklevel=2) return "0" <= char <= "9" def isname(name): + import warnings + warnings.warn('sre_parse.isname() will be removed in 3.5', + DeprecationWarning, stacklevel=2) # check that group name is a valid string if not isident(name[0]): return False @@ -242,20 +263,30 @@ def _class_escape(source, escape): c = escape[1:2] if c == "x": # hexadecimal escape (exactly two digits) - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = escape[2:] - if len(escape) != 2: - raise error("bogus escape: %s" % repr("\\" + escape)) - return LITERAL, int(escape, 16) & 0xff + escape += source.getwhile(2, HEXDIGITS) + if len(escape) != 4: + raise ValueError + return LITERAL, int(escape[2:], 16) & 0xff + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise ValueError + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise ValueError + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c elif c in OCTDIGITS: # octal escape (up to three digits) - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() - escape = escape[1:] - return LITERAL, int(escape, 8) & 0xff + escape += source.getwhile(2, OCTDIGITS) + return LITERAL, int(escape[1:], 8) & 0xff elif c in DIGITS: - raise error("bogus escape: %s" % repr(escape)) + raise ValueError if len(escape) == 2: return LITERAL, ord(escape[1]) except ValueError: @@ -274,15 +305,27 @@ def _escape(source, escape, state): c = escape[1:2] if c == "x": # hexadecimal escape - while source.next in HEXDIGITS and len(escape) < 4: - escape = escape + source.get() + escape += source.getwhile(2, HEXDIGITS) if len(escape) != 4: raise ValueError return LITERAL, int(escape[2:], 16) & 0xff + elif c == "u" and source.istext: + # unicode escape (exactly four digits) + escape += source.getwhile(4, HEXDIGITS) + if len(escape) != 6: + raise ValueError + return LITERAL, int(escape[2:], 16) + elif c == "U" and source.istext: + # unicode escape (exactly eight digits) + escape += source.getwhile(8, HEXDIGITS) + if len(escape) != 10: + raise ValueError + c = int(escape[2:], 16) + chr(c) # raise ValueError for invalid code + return LITERAL, c elif c == "0": # octal escape - while source.next in OCTDIGITS and len(escape) < 4: - escape = escape + source.get() + escape += source.getwhile(2, OCTDIGITS) return LITERAL, int(escape[1:], 8) & 0xff elif c in DIGITS: # octal escape *or* decimal group reference (sigh) @@ -555,8 +598,8 @@ def _parse(source, state): group = 1 if not name: raise error("missing group name") - if not isname(name): - raise error("bad character in group name") + if not name.isidentifier(): + raise error("bad character in group name %r" % name) elif sourcematch("="): # named backreference name = "" @@ -569,8 +612,9 @@ def _parse(source, state): name = name + char if not name: raise error("missing group name") - if not isname(name): - raise error("bad character in group name") + if not name.isidentifier(): + raise error("bad character in backref group name " + "%r" % name) gid = state.groupdict.get(name) if gid is None: raise error("unknown group name") @@ -623,7 +667,7 @@ def _parse(source, state): group = 2 if not condname: raise error("missing group name") - if isname(condname): + if condname.isidentifier(): condgroup = state.groupdict.get(condname) if condgroup is None: raise error("unknown group name") @@ -760,7 +804,7 @@ def parse_template(source, pattern): if index < 0: raise error("negative group number") except ValueError: - if not isname(name): + if not name.isidentifier(): raise error("bad character in group name") try: index = pattern.groupindex[name] @@ -802,7 +846,7 @@ def parse_template(source, pattern): else: # The tokenizer implicitly decodes bytes objects as latin-1, we must # therefore re-encode the final representation. - encode = lambda x: x.encode('latin1') + encode = lambda x: x.encode('latin-1') for c, s in p: if c is MARK: groupsappend((i, s)) |