diff options
author | Georg Brandl <georg@python.org> | 2014-09-16 14:06:54 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-09-16 14:06:54 +0200 |
commit | 4ebcf72d1a077c29d94a0cefce3f068ce41a37eb (patch) | |
tree | 887c6378b170a1fb00a252d929738cb227f99967 | |
parent | 5e5586a698e82c7b596ab2e47f035d2aa941b400 (diff) | |
download | pygments-4ebcf72d1a077c29d94a0cefce3f068ce41a37eb.tar.gz |
Add module to optimize regexes that consist of a long |-separated list of literals.
-rw-r--r-- | CHANGES | 4 | ||||
-rw-r--r-- | pygments/lexer.py | 24 | ||||
-rw-r--r-- | pygments/regexopt.py | 93 | ||||
-rw-r--r-- | tests/test_regexopt.py | 39 |
4 files changed, 157 insertions, 3 deletions
# -*- coding: utf-8 -*-
"""
    pygments.regexopt
    ~~~~~~~~~~~~~~~~~

    An algorithm that generates optimized regexes for matching long lists of
    literal strings.

    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
from re import escape
from os.path import commonprefix
from itertools import groupby
from operator import itemgetter

# Characters that must be backslash-escaped inside a [...] character set.
CS_ESCAPE = re.compile(r'[\^\\\-\]]')
FIRST_ELEMENT = itemgetter(0)


class words:
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    The RegexLexer metaclass recognizes instances of this class in token
    definitions and replaces them with ``regex_opt(...)`` output.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        # *words*: iterable of literal strings to match.
        # *prefix*/*suffix*: raw regex snippets pre-/appended verbatim
        # (e.g. ``r'\b'`` word boundaries) -- they are NOT escaped.
        self.words = words
        self.prefix = prefix
        self.suffix = suffix


def make_charset(letters):
    """Return a regex character set ``[...]`` matching any of *letters*,
    escaping the characters that are special inside a set."""
    return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'


def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *open_paren* is the group opener to wrap the result in ('(' for a
    capturing group, '(?:' for non-capturing, '' for none); the matching
    ')' is appended automatically when a group was opened.
    """
    close_paren = open_paren and ')' or ''
    if not strings:
        # nothing left to match
        return ''
    first = strings[0]
    if len(strings) == 1:
        # only one string: just escape it literally
        return open_paren + escape(first) + close_paren
    if not first:
        # first (sorted) string is empty: the rest of the alternation
        # becomes optional
        return open_paren + regex_opt_inner(strings[1:], '(?:') \
            + '?' + close_paren
    if len(first) == 1:
        # multiple one-char strings?  make a charset
        oneletter = []
        rest = []
        for s in strings:
            if len(s) == 1:
                oneletter.append(s)
            else:
                rest.append(s)
        if len(oneletter) > 1:  # do we have more than one oneletter string?
            if rest:
                # 1-character strings as a charset, alternated with the rest
                return open_paren + regex_opt_inner(rest, '') + '|' \
                    + make_charset(oneletter) + close_paren
            # only 1-character strings: a charset is atomic, no group needed
            return make_charset(oneletter)
    prefix = commonprefix(strings)
    if prefix:
        plen = len(prefix)
        # we have a prefix for all strings: factor it out
        return open_paren + escape(prefix) \
            + regex_opt_inner([s[plen:] for s in strings], '(?:') \
            + close_paren
    # is there a suffix?
    strings_rev = [s[::-1] for s in strings]
    suffix = commonprefix(strings_rev)
    if suffix:
        slen = len(suffix)
        # factor out the common suffix; stems must be re-sorted for recursion
        return open_paren \
            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
            + escape(suffix[::-1]) + close_paren
    # last resort: recurse on common 1-string prefixes -- groupby splits the
    # sorted list into the run sharing first[0] and the remainder
    return open_paren + \
        '|'.join(regex_opt_inner(list(group[1]), '')
                 for group in groupby(strings, lambda s: s[0] == first[0])) \
        + close_paren


def regex_opt(strings, flags=0, prefix='', suffix=''):
    """Return a compiled regex that matches any string in the given list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.

    *prefix* and *suffix* are pre- and appended to the final regex.
    """
    strings = sorted(strings)
    rex = prefix + regex_opt_inner(strings, '(') + suffix
    return re.compile(rex, flags)
# -*- coding: utf-8 -*-
"""
    Tests for pygments.regexopt
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~

    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import random
import unittest
import itertools

from pygments.regexopt import regex_opt

# Restricted alphabet keeps the exhaustive keyword lists small.
ALPHABET = ['a', 'b', 'c', 'd', 'e']
N_TRIES = 15


class RegexOptTestCase(unittest.TestCase):
    """Randomized property test for regex_opt: the optimized regex must
    match exactly the chosen keywords and nothing else from the universe."""

    def generate_keywordlist(self, length):
        # All non-decreasing letter combinations (with repetition) of the
        # given length, joined into keyword strings.
        return [''.join(p) for p in
                itertools.combinations_with_replacement(ALPHABET, length)]

    def test_randomly(self):
        # generate a list of all possible keywords of a certain length using
        # a restricted alphabet, then choose some to match and make sure only
        # those do
        for n in range(3, N_TRIES):
            kwlist = self.generate_keywordlist(n)
            to_match = random.sample(kwlist,
                                     random.randint(1, len(kwlist) - 1))
            no_match = set(kwlist) - set(to_match)
            # NOTE(review): the original passed ``True`` as the second
            # argument, i.e. as the ``re`` *flags* value; True == 1 ==
            # re.TEMPLATE, an internal flag that forbids repeat operators.
            # The default flags (0) are what the test intends.
            rex = regex_opt(to_match)
            for w in to_match:
                self.assertTrue(rex.match(w))
            for w in no_match:
                self.assertFalse(rex.match(w))