Merged in __russ__/pygments-main (pull request #165)

author: Georg Brandl <georg@python.org> 2014-10-08 08:50:24 +0200
committer: Georg Brandl <georg@python.org> 2014-10-08 08:50:24 +0200
commit: ab509e4ea2a8bd3c7e8e355b0e83b3e2de9f7a01 (patch)
tree: db1c94d9d2ba3fc0c664b71ba798007eb0da5a65 /pygments/regexopt.py
parent: 7f5c98a36c3a8e1b9877e1d4cfe41fd00f08833a (diff)
parent: e07ba8bf31d7a9ee2cfd4832608a9453a9f81fbe (diff)
download: pygments-ab509e4ea2a8bd3c7e8e355b0e83b3e2de9f7a01.tar.gz
1 files changed, 92 insertions, 0 deletions
diff --git a/pygments/regexopt.py b/pygments/regexopt.py
new file mode 100644
index 00000000..ec048309
--- /dev/null
+++ b/pygments/regexopt.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""
+    pygments.regexopt
+    ~~~~~~~~~~~~~~~~~
+
+    An algorithm that generates optimized regexes for matching long lists of
+    literal strings.
+
+    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
+    :license: BSD, see LICENSE for details.
+"""
+
+import re
+from re import escape
+from os.path import commonprefix
+from itertools import groupby
+from operator import itemgetter
+
+CS_ESCAPE = re.compile(r'[\^\\\-\]]')  # matches ^ \ - ] (escaped by make_charset)
+FIRST_ELEMENT = itemgetter(0)  # NOTE(review): not referenced elsewhere in this diff
+
+
+def make_charset(letters):
+    return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'
+
+
+def regex_opt_inner(strings, open_paren):
+    """Return a regex that matches any string in the sorted list of strings."""
+    close_paren = open_paren and ')' or ''
+    # print strings, repr(open_paren)
+    if not strings:
+        # print '-> nothing left'
+        return ''
+    first = strings[0]
+    if len(strings) == 1:
+        # print '-> only 1 string'
+        return open_paren + escape(first) + close_paren
+    if not first:
+        # print '-> first string empty'
+        return open_paren + regex_opt_inner(strings[1:], '(?:') \
+            + '?' + close_paren
+    if len(first) == 1:
+        # multiple one-char strings? make a charset
+        oneletter = []
+        rest = []
+        for s in strings:
+            if len(s) == 1:
+                oneletter.append(s)
+            else:
+                rest.append(s)
+        if len(oneletter) > 1:  # do we have more than one oneletter string?
+            if rest:
+                # print '-> 1-character + rest'
+                return open_paren + regex_opt_inner(rest, '') + '|' \
+                    + make_charset(oneletter) + close_paren
+            # print '-> only 1-character'
+            return make_charset(oneletter)
+    prefix = commonprefix(strings)
+    if prefix:
+        plen = len(prefix)
+        # we have a prefix for all strings
+        # print '-> prefix:', prefix
+        return open_paren + escape(prefix) \
+            + regex_opt_inner([s[plen:] for s in strings], '(?:') \
+            + close_paren
+    # is there a suffix?
+    strings_rev = [s[::-1] for s in strings]
+    suffix = commonprefix(strings_rev)
+    if suffix:
+        slen = len(suffix)
+        # print '-> suffix:', suffix[::-1]
+        return open_paren \
+            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
+            + escape(suffix[::-1]) + close_paren
+    # recurse on common 1-string prefixes
+    # print '-> last resort'
+    return open_paren + \
+        '|'.join(regex_opt_inner(list(group[1]), '')
+                 for group in groupby(strings, lambda s: s[0] == first[0])) \
+        + close_paren
+
+
+def regex_opt(strings, prefix='', suffix=''):
+    """Return a compiled regex that matches any string in the given list.
+
+    The strings to match must be literal strings, not regexes.  They will be
+    regex-escaped.
+
+    *prefix* and *suffix* are pre- and appended to the final regex.
+    """
+    strings = sorted(strings)
+    return prefix + regex_opt_inner(strings, '(') + suffix
author	Georg Brandl <georg@python.org>	2014-10-08 08:50:24 +0200
committer	Georg Brandl <georg@python.org>	2014-10-08 08:50:24 +0200
commit	ab509e4ea2a8bd3c7e8e355b0e83b3e2de9f7a01 (patch)
tree	db1c94d9d2ba3fc0c664b71ba798007eb0da5a65 /pygments/regexopt.py
parent	7f5c98a36c3a8e1b9877e1d4cfe41fd00f08833a (diff)
parent	e07ba8bf31d7a9ee2cfd4832608a9453a9f81fbe (diff)
download	pygments-ab509e4ea2a8bd3c7e8e355b0e83b3e2de9f7a01.tar.gz