diff options
author | Georg Brandl <georg@python.org> | 2014-10-08 08:50:24 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-10-08 08:50:24 +0200 |
commit | ab509e4ea2a8bd3c7e8e355b0e83b3e2de9f7a01 (patch) | |
tree | db1c94d9d2ba3fc0c664b71ba798007eb0da5a65 /pygments/regexopt.py | |
parent | 7f5c98a36c3a8e1b9877e1d4cfe41fd00f08833a (diff) | |
parent | e07ba8bf31d7a9ee2cfd4832608a9453a9f81fbe (diff) | |
download | pygments-ab509e4ea2a8bd3c7e8e355b0e83b3e2de9f7a01.tar.gz |
Merged in __russ__/pygments-main (pull request #165)
Diffstat (limited to 'pygments/regexopt.py')
-rw-r--r-- | pygments/regexopt.py | 92 |
1 files changed, 92 insertions, 0 deletions
# -*- coding: utf-8 -*-
"""
    pygments.regexopt
    ~~~~~~~~~~~~~~~~~

    An algorithm that generates optimized regexes for matching long lists of
    literal strings.

    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
from re import escape
from os.path import commonprefix
from itertools import groupby
from operator import itemgetter

# Characters that must be backslash-escaped inside a ``[...]`` character
# class.  ``[`` is included so that a class never starts with ``[[``, which
# newer Python versions flag as a possible nested set.
CS_ESCAPE = re.compile(r'[\[\^\\\-\]]')
FIRST_ELEMENT = itemgetter(0)


def make_charset(letters):
    """Return a regex character class matching any of the given characters.

    *letters* is an iterable of one-character strings; characters that are
    special inside a character class are backslash-escaped.
    """
    escaped = CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters))
    return '[' + escaped + ']'


def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *strings* must already be sorted; the algorithm relies on the empty
    string and shared first characters being adjacent at the front.

    *open_paren* is the group opener to wrap the result in (``'('``,
    ``'(?:'``, or ``''`` for no wrapping).  An empty *strings* list yields
    ``''`` without any wrapping.
    """
    close_paren = ')' if open_paren else ''
    if not strings:
        return ''
    first = strings[0]
    if len(strings) == 1:
        return open_paren + escape(first) + close_paren
    if not first:
        # The empty string sorts first: match the remainder optionally.
        return (open_paren + regex_opt_inner(strings[1:], '(?:')
                + '?' + close_paren)
    if len(first) == 1:
        # Several one-character strings can be folded into a charset.
        oneletter = [s for s in strings if len(s) == 1]
        rest = [s for s in strings if len(s) != 1]
        if len(oneletter) > 1:
            charset = make_charset(oneletter)
            if rest:
                return (open_paren + regex_opt_inner(rest, '') + '|'
                        + charset + close_paren)
            # Wrap the bare charset too, so that the requested group is
            # present no matter which branch produced the result.
            return open_paren + charset + close_paren
    prefix = commonprefix(strings)
    if prefix:
        # Factor out a prefix shared by all strings.
        plen = len(prefix)
        return (open_paren + escape(prefix)
                + regex_opt_inner([s[plen:] for s in strings], '(?:')
                + close_paren)
    # No common prefix; look for a common suffix by comparing the reversed
    # strings.
    suffix = commonprefix([s[::-1] for s in strings])
    if suffix:
        slen = len(suffix)
        # Stripping the suffix can change the ordering, so sort again.
        return (open_paren
                + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:')
                + escape(suffix[::-1]) + close_paren)
    # Last resort: split into the strings sharing the first string's
    # initial character and all the others, and alternate between them.
    return (open_paren
            + '|'.join(regex_opt_inner(list(group[1]), '')
                       for group in groupby(strings,
                                            lambda s: s[0] == first[0]))
            + close_paren)


def regex_opt(strings, prefix='', suffix=''):
    """Return a regex pattern string that matches any string in the list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.  The result is not compiled; pass it to ``re.compile``.

    For a non-empty list the alternatives are wrapped in exactly one
    capturing group.

    *prefix* and *suffix* are pre- and appended to the final regex.
    """
    strings = sorted(strings)
    return prefix + regex_opt_inner(strings, '(') + suffix