summaryrefslogtreecommitdiff
path: root/Lib/sre_compile.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r--Lib/sre_compile.py184
1 files changed, 73 insertions, 111 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 550ea158af..502b0616c6 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -13,19 +13,13 @@
import _sre
import sre_parse
from sre_constants import *
-from _sre import MAXREPEAT
assert _sre.MAGIC == MAGIC, "SRE module mismatch"
-if _sre.CODESIZE == 2:
- MAXCODE = 65535
-else:
- MAXCODE = 0xFFFFFFFF
-
-_LITERAL_CODES = set([LITERAL, NOT_LITERAL])
-_REPEATING_CODES = set([REPEAT, MIN_REPEAT, MAX_REPEAT])
-_SUCCESS_CODES = set([SUCCESS, FAILURE])
-_ASSERT_CODES = set([ASSERT, ASSERT_NOT])
+_LITERAL_CODES = {LITERAL, NOT_LITERAL}
+_REPEATING_CODES = {REPEAT, MIN_REPEAT, MAX_REPEAT}
+_SUCCESS_CODES = {SUCCESS, FAILURE}
+_ASSERT_CODES = {ASSERT, ASSERT_NOT}
# Sets of lowercase characters which have the same uppercase.
_equivalences = (
@@ -86,75 +80,75 @@ def _compile(code, pattern, flags):
if flags & SRE_FLAG_IGNORECASE:
lo = _sre.getlower(av, flags)
if fixes and lo in fixes:
- emit(OPCODES[IN_IGNORE])
+ emit(IN_IGNORE)
skip = _len(code); emit(0)
if op is NOT_LITERAL:
- emit(OPCODES[NEGATE])
+ emit(NEGATE)
for k in (lo,) + fixes[lo]:
- emit(OPCODES[LITERAL])
+ emit(LITERAL)
emit(k)
- emit(OPCODES[FAILURE])
+ emit(FAILURE)
code[skip] = _len(code) - skip
else:
- emit(OPCODES[OP_IGNORE[op]])
+ emit(OP_IGNORE[op])
emit(lo)
else:
- emit(OPCODES[op])
+ emit(op)
emit(av)
elif op is IN:
if flags & SRE_FLAG_IGNORECASE:
- emit(OPCODES[OP_IGNORE[op]])
+ emit(OP_IGNORE[op])
def fixup(literal, flags=flags):
return _sre.getlower(literal, flags)
else:
- emit(OPCODES[op])
+ emit(op)
fixup = None
skip = _len(code); emit(0)
_compile_charset(av, flags, code, fixup, fixes)
code[skip] = _len(code) - skip
elif op is ANY:
if flags & SRE_FLAG_DOTALL:
- emit(OPCODES[ANY_ALL])
+ emit(ANY_ALL)
else:
- emit(OPCODES[ANY])
+ emit(ANY)
elif op in REPEATING_CODES:
if flags & SRE_FLAG_TEMPLATE:
- raise error("internal: unsupported template operator")
+ raise error("internal: unsupported template operator %r" % (op,))
elif _simple(av) and op is not REPEAT:
if op is MAX_REPEAT:
- emit(OPCODES[REPEAT_ONE])
+ emit(REPEAT_ONE)
else:
- emit(OPCODES[MIN_REPEAT_ONE])
+ emit(MIN_REPEAT_ONE)
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
- emit(OPCODES[SUCCESS])
+ emit(SUCCESS)
code[skip] = _len(code) - skip
else:
- emit(OPCODES[REPEAT])
+ emit(REPEAT)
skip = _len(code); emit(0)
emit(av[0])
emit(av[1])
_compile(code, av[2], flags)
code[skip] = _len(code) - skip
if op is MAX_REPEAT:
- emit(OPCODES[MAX_UNTIL])
+ emit(MAX_UNTIL)
else:
- emit(OPCODES[MIN_UNTIL])
+ emit(MIN_UNTIL)
elif op is SUBPATTERN:
if av[0]:
- emit(OPCODES[MARK])
+ emit(MARK)
emit((av[0]-1)*2)
# _compile_info(code, av[1], flags)
_compile(code, av[1], flags)
if av[0]:
- emit(OPCODES[MARK])
+ emit(MARK)
emit((av[0]-1)*2+1)
elif op in SUCCESS_CODES:
- emit(OPCODES[op])
+ emit(op)
elif op in ASSERT_CODES:
- emit(OPCODES[op])
+ emit(op)
skip = _len(code); emit(0)
if av[0] >= 0:
emit(0) # look ahead
@@ -164,57 +158,57 @@ def _compile(code, pattern, flags):
raise error("look-behind requires fixed-width pattern")
emit(lo) # look behind
_compile(code, av[1], flags)
- emit(OPCODES[SUCCESS])
+ emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is CALL:
- emit(OPCODES[op])
+ emit(op)
skip = _len(code); emit(0)
_compile(code, av, flags)
- emit(OPCODES[SUCCESS])
+ emit(SUCCESS)
code[skip] = _len(code) - skip
elif op is AT:
- emit(OPCODES[op])
+ emit(op)
if flags & SRE_FLAG_MULTILINE:
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
elif flags & SRE_FLAG_UNICODE:
av = AT_UNICODE.get(av, av)
- emit(ATCODES[av])
+ emit(av)
elif op is BRANCH:
- emit(OPCODES[op])
+ emit(op)
tail = []
tailappend = tail.append
for av in av[1]:
skip = _len(code); emit(0)
# _compile_info(code, av, flags)
_compile(code, av, flags)
- emit(OPCODES[JUMP])
+ emit(JUMP)
tailappend(_len(code)); emit(0)
code[skip] = _len(code) - skip
- emit(0) # end of branch
+ emit(FAILURE) # end of branch
for tail in tail:
code[tail] = _len(code) - tail
elif op is CATEGORY:
- emit(OPCODES[op])
+ emit(op)
if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av]
elif flags & SRE_FLAG_UNICODE:
av = CH_UNICODE[av]
- emit(CHCODES[av])
+ emit(av)
elif op is GROUPREF:
if flags & SRE_FLAG_IGNORECASE:
- emit(OPCODES[OP_IGNORE[op]])
+ emit(OP_IGNORE[op])
else:
- emit(OPCODES[op])
+ emit(op)
emit(av-1)
elif op is GROUPREF_EXISTS:
- emit(OPCODES[op])
+ emit(op)
emit(av[0]-1)
skipyes = _len(code); emit(0)
_compile(code, av[1], flags)
if av[2]:
- emit(OPCODES[JUMP])
+ emit(JUMP)
skipno = _len(code); emit(0)
code[skipyes] = _len(code) - skipyes + 1
_compile(code, av[2], flags)
@@ -222,19 +216,18 @@ def _compile(code, pattern, flags):
else:
code[skipyes] = _len(code) - skipyes + 1
else:
- raise ValueError("unsupported operand type", op)
+ raise error("internal: unsupported operand type %r" % (op,))
def _compile_charset(charset, flags, code, fixup=None, fixes=None):
# compile charset subprogram
emit = code.append
- for op, av in _optimize_charset(charset, fixup, fixes,
- flags & SRE_FLAG_UNICODE):
- emit(OPCODES[op])
+ for op, av in _optimize_charset(charset, fixup, fixes):
+ emit(op)
if op is NEGATE:
pass
elif op is LITERAL:
emit(av)
- elif op is RANGE:
+ elif op is RANGE or op is RANGE_IGNORE:
emit(av[0])
emit(av[1])
elif op is CHARSET:
@@ -243,16 +236,16 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
code.extend(av)
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
- emit(CHCODES[CH_LOCALE[av]])
+ emit(CH_LOCALE[av])
elif flags & SRE_FLAG_UNICODE:
- emit(CHCODES[CH_UNICODE[av]])
+ emit(CH_UNICODE[av])
else:
- emit(CHCODES[av])
+ emit(av)
else:
- raise error("internal: unsupported set operator")
- emit(OPCODES[FAILURE])
+ raise error("internal: unsupported set operator %r" % (op,))
+ emit(FAILURE)
-def _optimize_charset(charset, fixup, fixes, isunicode):
+def _optimize_charset(charset, fixup, fixes):
# internal: optimize character set
out = []
tail = []
@@ -262,10 +255,10 @@ def _optimize_charset(charset, fixup, fixes, isunicode):
try:
if op is LITERAL:
if fixup:
- i = fixup(av)
- charmap[i] = 1
- if fixes and i in fixes:
- for k in fixes[i]:
+ lo = fixup(av)
+ charmap[lo] = 1
+ if fixes and lo in fixes:
+ for k in fixes[lo]:
charmap[k] = 1
else:
charmap[av] = 1
@@ -291,21 +284,13 @@ def _optimize_charset(charset, fixup, fixes, isunicode):
# character set contains non-UCS1 character codes
charmap += b'\0' * 0xff00
continue
- # character set contains non-BMP character codes
- if fixup and isunicode and op is RANGE:
- lo, hi = av
- ranges = [av]
- # There are only two ranges of cased astral characters:
- # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi).
- _fixup_range(max(0x10000, lo), min(0x11fff, hi),
- ranges, fixup)
- for lo, hi in ranges:
- if lo == hi:
- tail.append((LITERAL, hi))
- else:
- tail.append((RANGE, (lo, hi)))
- else:
- tail.append((op, av))
+ # Character set contains non-BMP character codes.
+ # There are only two ranges of cased non-BMP characters:
+ # 10400-1044F (Deseret) and 118A0-118DF (Warang Citi),
+ # and for both ranges RANGE_IGNORE works.
+ if fixup and op is RANGE:
+ op = RANGE_IGNORE
+ tail.append((op, av))
break
# compress character map
@@ -383,25 +368,8 @@ def _optimize_charset(charset, fixup, fixes, isunicode):
out += tail
return out
-def _fixup_range(lo, hi, ranges, fixup):
- for i in map(fixup, range(lo, hi+1)):
- for k, (lo, hi) in enumerate(ranges):
- if i < lo:
- if l == lo - 1:
- ranges[k] = (i, hi)
- else:
- ranges.insert(k, (i, i))
- break
- elif i > hi:
- if i == hi + 1:
- ranges[k] = (lo, i)
- break
- else:
- break
- else:
- ranges.append((i, i))
-
_CODEBITS = _sre.CODESIZE * 8
+MAXCODE = (1 << _CODEBITS) - 1
_BITS_TRANS = b'0' + b'1' * 255
def _mk_bitmap(bits, _CODEBITS=_CODEBITS, _int=int):
s = bits.translate(_BITS_TRANS)[::-1]
@@ -446,8 +414,11 @@ def _compile_info(code, pattern, flags):
# this contains min/max pattern width, and an optional literal
# prefix or a character map
lo, hi = pattern.getwidth()
+ if hi > MAXCODE:
+ hi = MAXCODE
if lo == 0:
- return # not worth it
+ code.extend([INFO, 4, 0, lo, hi])
+ return
# look for a literal prefix
prefix = []
prefixappend = prefix.append
@@ -505,21 +476,21 @@ def _compile_info(code, pattern, flags):
elif op is IN:
charset = av
## if prefix:
-## print "*** PREFIX", prefix, prefix_skip
+## print("*** PREFIX", prefix, prefix_skip)
## if charset:
-## print "*** CHARSET", charset
+## print("*** CHARSET", charset)
# add an info block
emit = code.append
- emit(OPCODES[INFO])
+ emit(INFO)
skip = len(code); emit(0)
# literal flag
mask = 0
if prefix:
mask = SRE_INFO_PREFIX
if len(prefix) == prefix_skip == len(pattern.data):
- mask = mask + SRE_INFO_LITERAL
+ mask = mask | SRE_INFO_LITERAL
elif charset:
- mask = mask + SRE_INFO_CHARSET
+ mask = mask | SRE_INFO_CHARSET
emit(mask)
# pattern length
if lo < MAXCODE:
@@ -527,10 +498,7 @@ def _compile_info(code, pattern, flags):
else:
emit(MAXCODE)
prefix = prefix[:MAXCODE]
- if hi < MAXCODE:
- emit(hi)
- else:
- emit(0)
+ emit(min(hi, MAXCODE))
# add literal prefix
if prefix:
emit(len(prefix)) # length
@@ -556,7 +524,7 @@ def _code(p, flags):
# compile the pattern
_compile(code, p.data, flags)
- code.append(OPCODES[SUCCESS])
+ code.append(SUCCESS)
return code
@@ -571,13 +539,7 @@ def compile(p, flags=0):
code = _code(p, flags)
- # print code
-
- # XXX: <fl> get rid of this limitation!
- if p.pattern.groups > 100:
- raise AssertionError(
- "sorry, but this version only supports 100 named groups"
- )
+ # print(code)
# map in either direction
groupindex = p.pattern.groupdict