diff options
Diffstat (limited to 'Lib/sre_compile.py')
-rw-r--r-- | Lib/sre_compile.py | 140 |
1 files changed, 83 insertions, 57 deletions
diff --git a/Lib/sre_compile.py b/Lib/sre_compile.py index 502b0616c6..420d83de63 100644 --- a/Lib/sre_compile.py +++ b/Lib/sre_compile.py @@ -71,7 +71,8 @@ def _compile(code, pattern, flags): ASSERT_CODES = _ASSERT_CODES if (flags & SRE_FLAG_IGNORECASE and not (flags & SRE_FLAG_LOCALE) and - flags & SRE_FLAG_UNICODE): + flags & SRE_FLAG_UNICODE and + not (flags & SRE_FLAG_ASCII)): fixes = _ignorecase_fixes else: fixes = None @@ -137,14 +138,15 @@ def _compile(code, pattern, flags): else: emit(MIN_UNTIL) elif op is SUBPATTERN: - if av[0]: + group, add_flags, del_flags, p = av + if group: emit(MARK) - emit((av[0]-1)*2) - # _compile_info(code, av[1], flags) - _compile(code, av[1], flags) - if av[0]: + emit((group-1)*2) + # _compile_info(code, p, (flags | add_flags) & ~del_flags) + _compile(code, p, (flags | add_flags) & ~del_flags) + if group: emit(MARK) - emit((av[0]-1)*2+1) + emit((group-1)*2+1) elif op in SUCCESS_CODES: emit(op) elif op in ASSERT_CODES: @@ -172,7 +174,7 @@ def _compile(code, pattern, flags): av = AT_MULTILINE.get(av, av) if flags & SRE_FLAG_LOCALE: av = AT_LOCALE.get(av, av) - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): av = AT_UNICODE.get(av, av) emit(av) elif op is BRANCH: @@ -193,7 +195,7 @@ def _compile(code, pattern, flags): emit(op) if flags & SRE_FLAG_LOCALE: av = CH_LOCALE[av] - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): av = CH_UNICODE[av] emit(av) elif op is GROUPREF: @@ -237,7 +239,7 @@ def _compile_charset(charset, flags, code, fixup=None, fixes=None): elif op is CATEGORY: if flags & SRE_FLAG_LOCALE: emit(CH_LOCALE[av]) - elif flags & SRE_FLAG_UNICODE: + elif (flags & SRE_FLAG_UNICODE) and not (flags & SRE_FLAG_ASCII): emit(CH_UNICODE[av]) else: emit(av) @@ -409,42 +411,42 @@ def _generate_overlap_table(prefix): table[i] = idx + 1 return table -def _compile_info(code, pattern, flags): - # internal: compile an info block. in the current version, - # this contains min/max pattern width, and an optional literal - # prefix or a character map - lo, hi = pattern.getwidth() - if hi > MAXCODE: - hi = MAXCODE - if lo == 0: - code.extend([INFO, 4, 0, lo, hi]) - return - # look for a literal prefix +def _get_literal_prefix(pattern): + # look for literal prefix prefix = [] prefixappend = prefix.append - prefix_skip = 0 + prefix_skip = None + for op, av in pattern.data: + if op is LITERAL: + prefixappend(av) + elif op is SUBPATTERN: + group, add_flags, del_flags, p = av + if add_flags & SRE_FLAG_IGNORECASE: + break + prefix1, prefix_skip1, got_all = _get_literal_prefix(p) + if prefix_skip is None: + if group is not None: + prefix_skip = len(prefix) + elif prefix_skip1 is not None: + prefix_skip = len(prefix) + prefix_skip1 + prefix.extend(prefix1) + if not got_all: + break + else: + break + else: + return prefix, prefix_skip, True + return prefix, prefix_skip, False + +def _get_charset_prefix(pattern): charset = [] # not used charsetappend = charset.append - if not (flags & SRE_FLAG_IGNORECASE): - # look for literal prefix - for op, av in pattern.data: - if op is LITERAL: - if len(prefix) == prefix_skip: - prefix_skip = prefix_skip + 1 - prefixappend(av) - elif op is SUBPATTERN and len(av[1]) == 1: - op, av = av[1][0] - if op is LITERAL: - prefixappend(av) - else: - break - else: - break - # if no prefix, look for charset prefix - if not prefix and pattern.data: - op, av = pattern.data[0] - if op is SUBPATTERN and av[1]: - op, av = av[1][0] + if pattern.data: + op, av = pattern.data[0] + if op is SUBPATTERN: + group, add_flags, del_flags, p = av + if p and not (add_flags & SRE_FLAG_IGNORECASE): + op, av = p[0] if op is LITERAL: charsetappend((op, av)) elif op is BRANCH: @@ -460,21 +462,43 @@ def _compile_info(code, pattern, flags): break else: charset = c - elif op is BRANCH: - c = [] - cappend = c.append - for p in av[1]: - if not p: - break - op, av = p[0] - if op is LITERAL: - cappend((op, av)) - else: - break + elif op is BRANCH: + c = [] + cappend = c.append + for p in av[1]: + if not p: + break + op, av = p[0] + if op is LITERAL: + cappend((op, av)) else: - charset = c - elif op is IN: - charset = av + break + else: + charset = c + elif op is IN: + charset = av + return charset + +def _compile_info(code, pattern, flags): + # internal: compile an info block. in the current version, + # this contains min/max pattern width, and an optional literal + # prefix or a character map + lo, hi = pattern.getwidth() + if hi > MAXCODE: + hi = MAXCODE + if lo == 0: + code.extend([INFO, 4, 0, lo, hi]) + return + # look for a literal prefix + prefix = [] + prefix_skip = 0 + charset = [] # not used + if not (flags & SRE_FLAG_IGNORECASE): + # look for literal prefix + prefix, prefix_skip, got_all = _get_literal_prefix(pattern) + # if no prefix, look for charset prefix + if not prefix: + charset = _get_charset_prefix(pattern) ## if prefix: ## print("*** PREFIX", prefix, prefix_skip) ## if charset: @@ -487,7 +511,7 @@ def _compile_info(code, pattern, flags): mask = 0 if prefix: mask = SRE_INFO_PREFIX - if len(prefix) == prefix_skip == len(pattern.data): + if prefix_skip is None and got_all: mask = mask | SRE_INFO_LITERAL elif charset: mask = mask | SRE_INFO_CHARSET @@ -502,6 +526,8 @@ def _compile_info(code, pattern, flags): # add literal prefix if prefix: emit(len(prefix)) # length + if prefix_skip is None: + prefix_skip = len(prefix) emit(prefix_skip) # skip code.extend(prefix) # generate overlap table |