summary | refs | log | tree | commit | diff
path: root/Lib/sre_compile.py
diff options
context:
space:
mode:
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- Lib/sre_compile.py 140
1 files changed, 83 insertions, 57 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py
index 502b0616c6..420d83de63 100644
--- a/Lib/sre_compile.py
+++ b/Lib/sre_compile.py
@@ -71,7 +71,8 @@ def _compile(code, pattern, flags):
ASSERT_CODES = _ASSERT_CODES
if (flags & SRE_FLAG_IGNORECASE and
not (flags & SRE_FLAG_LOCALE) and
- flags & SRE_FLAG_UNICODE):
+ flags & SRE_FLAG_UNICODE and
+ not (flags & SRE_FLAG_ASCII)):
fixes = _ignorecase_fixes
else:
fixes = None
@@ -137,14 +138,15 @@ def _compile(code, pattern, flags):
else:
emit(MIN_UNTIL)
elif op is SUBPATTERN:
- if av[0]:
+ group, add_flags, del_flags, p = av
+ if group:
emit(MARK)
- emit((av[0]-1)*2)
- # _compile_info(code, av[1], flags)
- _compile(code, av[1], flags)
- if av[0]:
+ emit((group-1)*2)
+ # _compile_info(code, p, (flags | add_flags) & ~del_flags)
+ _compile(code, p, (flags | add_flags) & ~del_flags)
+ if group:
emit(MARK)
- emit((av[0]-1)*2+1)
+ emit((group-1)*2+1)
elif op in SUCCESS_CODES:
emit(op)
elif op in ASSERT_CODES:
@@ -172,7 +174,7 @@ def _compile(code, pattern, flags):
av = AT_MULTILINE.get(av, av)
if flags & SRE_FLAG_LOCALE:
av = AT_LOCALE.get(av, av)
- elif flags & SRE_FLAG_UNICODE:
+ elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
av = AT_UNICODE.get(av, av)
emit(av)
elif op is BRANCH:
@@ -193,7 +195,7 @@ def _compile(code, pattern, flags):
emit(op)
if flags & SRE_FLAG_LOCALE:
av = CH_LOCALE[av]
- elif flags & SRE_FLAG_UNICODE:
+ elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
av = CH_UNICODE[av]
emit(av)
elif op is GROUPREF:
@@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None):
elif op is CATEGORY:
if flags & SRE_FLAG_LOCALE:
emit(CH_LOCALE[av])
- elif flags & SRE_FLAG_UNICODE:
+ elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII):
emit(CH_UNICODE[av])
else:
emit(av)
@@ -409,42 +411,42 @@ def _generate_overlap_table(prefix):
table[i] = idx + 1
return table
-def _compile_info(code, pattern, flags):
- # internal: compile an info block. in the current version,
- # this contains min/max pattern width, and an optional literal
- # prefix or a character map
- lo, hi = pattern.getwidth()
- if hi > MAXCODE:
- hi = MAXCODE
- if lo == 0:
- code.extend([INFO, 4, 0, lo, hi])
- return
- # look for a literal prefix
+def _get_literal_prefix(pattern):
+ # look for literal prefix
prefix = []
prefixappend = prefix.append
- prefix_skip = 0
+ prefix_skip = None
+ for op, av in pattern.data:
+ if op is LITERAL:
+ prefixappend(av)
+ elif op is SUBPATTERN:
+ group, add_flags, del_flags, p = av
+ if add_flags & SRE_FLAG_IGNORECASE:
+ break
+ prefix1, prefix_skip1, got_all = _get_literal_prefix(p)
+ if prefix_skip is None:
+ if group is not None:
+ prefix_skip = len(prefix)
+ elif prefix_skip1 is not None:
+ prefix_skip = len(prefix) + prefix_skip1
+ prefix.extend(prefix1)
+ if not got_all:
+ break
+ else:
+ break
+ else:
+ return prefix, prefix_skip, True
+ return prefix, prefix_skip, False
+
+def _get_charset_prefix(pattern):
charset = [] # not used
charsetappend = charset.append
- if not (flags & SRE_FLAG_IGNORECASE):
- # look for literal prefix
- for op, av in pattern.data:
- if op is LITERAL:
- if len(prefix) == prefix_skip:
- prefix_skip = prefix_skip + 1
- prefixappend(av)
- elif op is SUBPATTERN and len(av[1]) == 1:
- op, av = av[1][0]
- if op is LITERAL:
- prefixappend(av)
- else:
- break
- else:
- break
- # if no prefix, look for charset prefix
- if not prefix and pattern.data:
- op, av = pattern.data[0]
- if op is SUBPATTERN and av[1]:
- op, av = av[1][0]
+ if pattern.data:
+ op, av = pattern.data[0]
+ if op is SUBPATTERN:
+ group, add_flags, del_flags, p = av
+ if p and not (add_flags & SRE_FLAG_IGNORECASE):
+ op, av = p[0]
if op is LITERAL:
charsetappend((op, av))
elif op is BRANCH:
@@ -460,21 +462,43 @@ def _compile_info(code, pattern, flags):
break
else:
charset = c
- elif op is BRANCH:
- c = []
- cappend = c.append
- for p in av[1]:
- if not p:
- break
- op, av = p[0]
- if op is LITERAL:
- cappend((op, av))
- else:
- break
+ elif op is BRANCH:
+ c = []
+ cappend = c.append
+ for p in av[1]:
+ if not p:
+ break
+ op, av = p[0]
+ if op is LITERAL:
+ cappend((op, av))
else:
- charset = c
- elif op is IN:
- charset = av
+ break
+ else:
+ charset = c
+ elif op is IN:
+ charset = av
+ return charset
+
+def _compile_info(code, pattern, flags):
+ # internal: compile an info block. in the current version,
+ # this contains min/max pattern width, and an optional literal
+ # prefix or a character map
+ lo, hi = pattern.getwidth()
+ if hi > MAXCODE:
+ hi = MAXCODE
+ if lo == 0:
+ code.extend([INFO, 4, 0, lo, hi])
+ return
+ # look for a literal prefix
+ prefix = []
+ prefix_skip = 0
+ charset = [] # not used
+ if not (flags & SRE_FLAG_IGNORECASE):
+ # look for literal prefix
+ prefix, prefix_skip, got_all = _get_literal_prefix(pattern)
+ # if no prefix, look for charset prefix
+ if not prefix:
+ charset = _get_charset_prefix(pattern)
## if prefix:
## print("*** PREFIX", prefix, prefix_skip)
## if charset:
@@ -487,7 +511,7 @@ def _compile_info(code, pattern, flags):
mask = 0
if prefix:
mask = SRE_INFO_PREFIX
- if len(prefix) == prefix_skip == len(pattern.data):
+ if prefix_skip is None and got_all:
mask = mask | SRE_INFO_LITERAL
elif charset:
mask = mask | SRE_INFO_CHARSET
@@ -502,6 +526,8 @@ def _compile_info(code, pattern, flags):
# add literal prefix
if prefix:
emit(len(prefix)) # length
+ if prefix_skip is None:
+ prefix_skip = len(prefix)
emit(prefix_skip) # skip
code.extend(prefix)
# generate overlap table