diff options
author | Christian Hammond <christian@beanbaginc.com> | 2016-11-04 16:57:38 -0700 |
---|---|---|
committer | Christian Hammond <christian@beanbaginc.com> | 2016-11-04 16:57:38 -0700 |
commit | 6ded9db39463372e5205a36bea72d6de516ece69 (patch) | |
tree | 1d1f497cc99dd44d2ee7e2c3daa35965157ff924 /pygments/regexopt.py | |
download | pygments-git-6ded9db39463372e5205a36bea72d6de516ece69.tar.gz |
Add support for partials and path segments for Handlebars.
This introduces support for some missing features to the Handlebars lexer:
Partials and path segments. Partials mostly appeared to work before, but the
`>` in `{{> ... }}` would appear as a syntax error, as could other
components of the partial. This change introduces support for:
* Standard partials: `{{> partialName}}`
* Partials with parameters: `{{> partialName varname="value"}}`
* Ddynamic partials: `{{> (partialFunc)}}`
* Ddynamic partials with lookups: `{{> (lookup ../path "partialName")}}`
* Partial blocks: `{{> @partial-block}}`
* Inline partials: `{{#*inline}}..{{/inline}}`
It also introduces support for path segments, which can reference content in
the current context or in a parent context. For instance, `this.name`,
`this/name`, `./name`, `../name`, `this/name`, etc. These are all now tracked
as variables.
Diffstat (limited to 'pygments/regexopt.py')
-rw-r--r-- | pygments/regexopt.py | 92 |
1 files changed, 92 insertions, 0 deletions
diff --git a/pygments/regexopt.py b/pygments/regexopt.py new file mode 100644 index 00000000..047c703f --- /dev/null +++ b/pygments/regexopt.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +""" + pygments.regexopt + ~~~~~~~~~~~~~~~~~ + + An algorithm that generates optimized regexes for matching long lists of + literal strings. + + :copyright: Copyright 2006-2015 by the Pygments team, see AUTHORS. + :license: BSD, see LICENSE for details. +""" + +import re +from re import escape +from os.path import commonprefix +from itertools import groupby +from operator import itemgetter + +CS_ESCAPE = re.compile(r'[\^\\\-\]]') +FIRST_ELEMENT = itemgetter(0) + + +def make_charset(letters): + return '[' + CS_ESCAPE.sub(lambda m: '\\' + m.group(), ''.join(letters)) + ']' + + +def regex_opt_inner(strings, open_paren): + """Return a regex that matches any string in the sorted list of strings.""" + close_paren = open_paren and ')' or '' + # print strings, repr(open_paren) + if not strings: + # print '-> nothing left' + return '' + first = strings[0] + if len(strings) == 1: + # print '-> only 1 string' + return open_paren + escape(first) + close_paren + if not first: + # print '-> first string empty' + return open_paren + regex_opt_inner(strings[1:], '(?:') \ + + '?' + close_paren + if len(first) == 1: + # multiple one-char strings? make a charset + oneletter = [] + rest = [] + for s in strings: + if len(s) == 1: + oneletter.append(s) + else: + rest.append(s) + if len(oneletter) > 1: # do we have more than one oneletter string? + if rest: + # print '-> 1-character + rest' + return open_paren + regex_opt_inner(rest, '') + '|' \ + + make_charset(oneletter) + close_paren + # print '-> only 1-character' + return open_paren + make_charset(oneletter) + close_paren + prefix = commonprefix(strings) + if prefix: + plen = len(prefix) + # we have a prefix for all strings + # print '-> prefix:', prefix + return open_paren + escape(prefix) \ + + regex_opt_inner([s[plen:] for s in strings], '(?:') \ + + close_paren + # is there a suffix? + strings_rev = [s[::-1] for s in strings] + suffix = commonprefix(strings_rev) + if suffix: + slen = len(suffix) + # print '-> suffix:', suffix[::-1] + return open_paren \ + + regex_opt_inner(sorted(s[:-slen] for s in strings), '(?:') \ + + escape(suffix[::-1]) + close_paren + # recurse on common 1-string prefixes + # print '-> last resort' + return open_paren + \ + '|'.join(regex_opt_inner(list(group[1]), '') + for group in groupby(strings, lambda s: s[0] == first[0])) \ + + close_paren + + +def regex_opt(strings, prefix='', suffix=''): + """Return a compiled regex that matches any string in the given list. + + The strings to match must be literal strings, not regexes. They will be + regex-escaped. + + *prefix* and *suffix* are pre- and appended to the final regex. + """ + strings = sorted(strings) + return prefix + regex_opt_inner(strings, '(') + suffix |