diff options
author | Georg Brandl <georg@python.org> | 2014-09-16 14:06:54 +0200 |
---|---|---|
committer | Georg Brandl <georg@python.org> | 2014-09-16 14:06:54 +0200 |
commit | 4ebcf72d1a077c29d94a0cefce3f068ce41a37eb (patch) | |
tree | 887c6378b170a1fb00a252d929738cb227f99967 | |
parent | 5e5586a698e82c7b596ab2e47f035d2aa941b400 (diff) | |
download | pygments-4ebcf72d1a077c29d94a0cefce3f068ce41a37eb.tar.gz |
Add module to optimize regexes that consist of a long |-separated list of literals.
-rw-r--r-- | CHANGES | 4 | ||||
-rw-r--r-- | pygments/lexer.py | 24 | ||||
-rw-r--r-- | pygments/regexopt.py | 93 | ||||
-rw-r--r-- | tests/test_regexopt.py | 39 |
4 files changed, 157 insertions, 3 deletions
# -*- coding: utf-8 -*-
"""
    pygments.regexopt
    ~~~~~~~~~~~~~~~~~

    An algorithm that generates optimized regexes for matching long lists of
    literal strings.

    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import re
from re import escape
from os.path import commonprefix
from itertools import groupby
from operator import itemgetter

# Characters that must be backslash-escaped inside a [...] character set.
CS_ESCAPE = re.compile(r'[\^\\\-\]]')
FIRST_ELEMENT = itemgetter(0)


class words:
    """
    Indicates a list of literal words that is transformed into an optimized
    regex that matches any of the words.

    The RegexLexer metaclass recognizes instances of this class in token
    definitions and replaces them with ``regex_opt(...)`` output.

    .. versionadded:: 2.0
    """
    def __init__(self, words, prefix='', suffix=''):
        # *words*: iterable of literal strings to match.
        # *prefix*/*suffix*: raw regex snippets pre-/appended verbatim
        # (e.g. ``r'\b'`` word boundaries) -- they are NOT escaped.
        self.words = words
        self.prefix = prefix
        self.suffix = suffix


def make_charset(letters):
    """Return a regex character set ``[...]`` matching any of *letters*,
    escaping the characters that are special inside a set."""
    return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']'


def regex_opt_inner(strings, open_paren):
    """Return a regex that matches any string in the sorted list of strings.

    *open_paren* is the group opener to wrap the result in ('(' for a
    capturing group, '(?:' for non-capturing, '' for none); the matching
    ')' is appended automatically when a group was opened.
    """
    close_paren = open_paren and ')' or ''
    if not strings:
        # nothing left to match
        return ''
    first = strings[0]
    if len(strings) == 1:
        # only one string: just escape it literally
        return open_paren + escape(first) + close_paren
    if not first:
        # first (sorted) string is empty: the rest of the alternation
        # becomes optional
        return open_paren + regex_opt_inner(strings[1:], '(?:') \
            + '?' + close_paren
    if len(first) == 1:
        # multiple one-char strings?  make a charset
        oneletter = []
        rest = []
        for s in strings:
            if len(s) == 1:
                oneletter.append(s)
            else:
                rest.append(s)
        if len(oneletter) > 1:  # do we have more than one oneletter string?
            if rest:
                # 1-character strings as a charset, alternated with the rest
                return open_paren + regex_opt_inner(rest, '') + '|' \
                    + make_charset(oneletter) + close_paren
            # only 1-character strings: a charset is atomic, no group needed
            return make_charset(oneletter)
    prefix = commonprefix(strings)
    if prefix:
        plen = len(prefix)
        # we have a prefix for all strings: factor it out
        return open_paren + escape(prefix) \
            + regex_opt_inner([s[plen:] for s in strings], '(?:') \
            + close_paren
    # is there a suffix?
    strings_rev = [s[::-1] for s in strings]
    suffix = commonprefix(strings_rev)
    if suffix:
        slen = len(suffix)
        # factor out the common suffix; stems must be re-sorted for recursion
        return open_paren \
            + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \
            + escape(suffix[::-1]) + close_paren
    # last resort: recurse on common 1-string prefixes -- groupby splits the
    # sorted list into the run sharing first[0] and the remainder
    return open_paren + \
        '|'.join(regex_opt_inner(list(group[1]), '')
                 for group in groupby(strings, lambda s: s[0] == first[0])) \
        + close_paren


def regex_opt(strings, flags=0, prefix='', suffix=''):
    """Return a compiled regex that matches any string in the given list.

    The strings to match must be literal strings, not regexes.  They will be
    regex-escaped.

    *prefix* and *suffix* are pre- and appended to the final regex.
    """
    strings = sorted(strings)
    rex = prefix + regex_opt_inner(strings, '(') + suffix
    return re.compile(rex, flags)
# -*- coding: utf-8 -*-
"""
    Tests for pygments.regexopt
    ~~~~~~~~~~~~~~~~~~~~~~~~~~~

    :copyright: Copyright 2006-2014 by the Pygments team, see AUTHORS.
    :license: BSD, see LICENSE for details.
"""

import random
import unittest
import itertools

from pygments.regexopt import regex_opt

# Restricted alphabet keeps the exhaustive keyword lists small.
ALPHABET = ['a', 'b', 'c', 'd', 'e']
N_TRIES = 15


class RegexOptTestCase(unittest.TestCase):
    """Randomized property test for regex_opt: the optimized regex must
    match exactly the chosen keywords and nothing else from the universe."""

    def generate_keywordlist(self, length):
        # All non-decreasing letter combinations (with repetition) of the
        # given length, joined into keyword strings.
        return [''.join(p) for p in
                itertools.combinations_with_replacement(ALPHABET, length)]

    def test_randomly(self):
        # generate a list of all possible keywords of a certain length using
        # a restricted alphabet, then choose some to match and make sure only
        # those do
        for n in range(3, N_TRIES):
            kwlist = self.generate_keywordlist(n)
            to_match = random.sample(kwlist,
                                     random.randint(1, len(kwlist) - 1))
            no_match = set(kwlist) - set(to_match)
            # NOTE(review): the original passed ``True`` as the second
            # argument, i.e. as the ``re`` *flags* value; True == 1 ==
            # re.TEMPLATE, an internal flag that forbids repeat operators.
            # The default flags (0) are what the test intends.
            rex = regex_opt(to_match)
            for w in to_match:
                self.assertTrue(rex.match(w))
            for w in no_match:
                self.assertFalse(rex.match(w))