author     blackbird <devnull@localhost>   2006-10-27 23:59:07 +0200
committer  blackbird <devnull@localhost>   2006-10-27 23:59:07 +0200
commit     0b4ae9ab3fa6057dce2833a3e34ba01511c10e44 (patch)
tree       f4afa530b3b9aae10144448d0aaa25bc990fb482 /pygments
parent     a400243228ed76501b820f2a6d0e7f924d5f9882 (diff)
download   pygments-0b4ae9ab3fa6057dce2833a3e34ba01511c10e44.tar.gz
[svn] checked in changes from the last days. including:
- text in logo
- documentation update
- new `guess_lexer` method
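
A minimal usage sketch of the new `guess_lexer` entry point, run against this
revision (Python 2 era API); the sample strings are made up for illustration::

    from pygments.lexers import guess_lexer
    from pygments.lexers.special import TextLexer

    # The Python shebang scores 1.0 via PythonLexer.analyse_text, so the
    # search stops at the first perfect match.
    lexer = guess_lexer('#!/usr/bin/env python\n\nimport os\n')
    assert lexer.name == 'Python'

    # Text that no analyse_text heuristic claims falls back to TextLexer.
    fallback = guess_lexer('plain prose with no markup in it at all')
    assert isinstance(fallback, TextLexer)
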
Diffstat (limited to 'pygments')
-rw-r--r--  pygments/lexer.py             46
-rw-r--r--  pygments/lexers/__init__.py   31
-rw-r--r--  pygments/lexers/agile.py      11
-rw-r--r--  pygments/lexers/templates.py  78
-rw-r--r--  pygments/lexers/web.py        18
-rw-r--r--  pygments/util.py             102
6 files changed, 275 insertions, 11 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 6f57f31d..b5320ce2 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -1,23 +1,39 @@
 # -*- coding: utf-8 -*-
 """
-    pygments.lexer
-    ~~~~~~~~~~~~~~
+pygments.lexer
+~~~~~~~~~~~~~~
 
-    Base lexer classes.
+Base lexer classes.
 
-    :copyright: 2006 by Georg Brandl.
-    :license: GNU LGPL, see LICENSE for more details.
+:copyright: 2006 by Georg Brandl.
+:license: GNU LGPL, see LICENSE for more details.
 """
 import re
+from types import FunctionType
 
 from pygments.token import Error, Text, Other, _TokenType
-from pygments.util import get_bool_opt, get_int_opt
+from pygments.util import get_bool_opt, get_int_opt, make_analysator
 
 
 __all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
            'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']
 
 
+_default_analyse = staticmethod(lambda x: 0.0)
+
+
+class LexerMeta(type):
+    """
+    This metaclass automagically converts ``analyse_text`` methods into
+    static methods which always return float values.
+    """
+
+    def __new__(cls, name, bases, d):
+        if 'analyse_text' in d:
+            d['analyse_text'] = make_analysator(d['analyse_text'])
+        return type.__new__(cls, name, bases, d)
+
+
 class Lexer(object):
     """
     Lexer for a specific language.
@@ -41,12 +57,28 @@ class Lexer(object):
     #: fn match rules
     filenames = []
 
+    __metaclass__ = LexerMeta
+
     def __init__(self, **options):
         self.options = options
         self.stripnl = get_bool_opt(options, 'stripnl', True)
         self.stripall = get_bool_opt(options, 'stripall', False)
         self.tabsize = get_int_opt(options, 'tabsize', 0)
 
+    def analyse_text(text):
+        """
+        Has to return an float between ``0`` and ``1`` that indicates
+        if a lexer wants to highighlight that. used by ``guess_lexer``.
+        If this method returns ``0`` it won't highlight it at all, if
+        it returns ``1`` highlighting with this lexer is guaranteed.
+
+        The `LexerMeta` metaclass automatically wraps this function so
+        that it works like a static method (no ``self`` or ``cls``
+        parameter) and the return value is automatically converted to
+        `float`. If the return value is an object that is boolean `False`
+        it's the same as if the return values was ``0.0``.
+        """
+
     def get_tokens(self, text):
         """
         Return an iterable of (tokentype, value) pairs generated from ``text``.
@@ -216,7 +248,7 @@ def using(_other, **kwargs):
     return callback
 
 
-class RegexLexerMeta(type):
+class RegexLexerMeta(LexerMeta):
     """
     Metaclass for RegexLexer, creates the self._tokens attribute from
     self.tokens on the first instantiation.
diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py
index 99ec4a79..0f7c0483 100644
--- a/pygments/lexers/__init__.py
+++ b/pygments/lexers/__init__.py
@@ -16,7 +16,8 @@
 from pygments.lexers._mapping import LEXERS
 from pygments.plugin import find_plugin_lexers
 
-__all__ = ['get_lexer_by_name', 'get_lexer_for_filename'] + LEXERS.keys()
+__all__ = ['get_lexer_by_name', 'get_lexer_for_filename',
+           'guess_lexer'] + LEXERS.keys()
 
 _lexer_cache = {}
 
@@ -68,6 +69,34 @@ def get_lexer_for_filename(fn, **options):
     raise ValueError('no lexer for filename %r found' % fn)
 
 
+def guess_lexer(text, **options):
+    """
+    Guess a lexer by strong distinctions in the text (eg, shebang).
+    """
+    best_lexer = [0.0, None]
+    # builtin lexers
+    for module_name, name, _, _ in LEXERS.itervalues():
+        if name not in _lexer_cache:
+            _load_lexers(module_name)
+        lexer = _lexer_cache[name]
+        rv = lexer.analyse_text(text)
+        if rv == 1.0:
+            return lexer(**options)
+        if rv > best_lexer[0]:
+            best_lexer[:] = (rv, lexer)
+    # plugin lexers
+    for lexer in find_plugin_lexers():
+        rv = lexer.analyse_text(text)
+        if rv == 1.0:
+            return lexer(**options)
+        if rv > best_lexer[0]:
+            best_lexer[:] = (rv, lexer)
+    if best_lexer[0] == 0.0 or best_lexer[1] is None:
+        from pygments.lexers.special import TextLexer
+        return TextLexer(**options)
+    return best_lexer[1](**options)
+
+
 class _automodule(types.ModuleType):
 
     def __getattr__(self, name):
diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py
index 61292824..aec32450 100644
--- a/pygments/lexers/agile.py
+++ b/pygments/lexers/agile.py
@@ -19,7 +19,7 @@
 from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, \
      LexerContext, include, combined, do_insertions, bygroups
 from pygments.token import Error, Text, \
      Comment, Operator, Keyword, Name, String, Number, Generic
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, shebang_matches
 
 __all__ = ['PythonLexer', 'PythonConsoleLexer', 'RubyLexer',
@@ -145,6 +145,9 @@ class PythonLexer(RegexLexer):
         ],
     }
 
+    def analyse_text(text):
+        return shebang_matches(text, r'pythonw?(2\.\d)?')
+
 
 class PythonConsoleLexer(Lexer):
     """
@@ -504,6 +507,9 @@ class RubyLexer(ExtendedRegexLexer):
     }
     tokens.update(gen_rubystrings_rules())
 
+    def analyse_text(text):
+        return shebang_matches(text, r'ruby(1\.\d)?')
+
 
 class RubyConsoleLexer(Lexer):
     """
@@ -675,6 +681,9 @@ class PerlLexer(RegexLexer):
         ]
     }
 
+    def analyse_text(text):
+        return shebang_matches(text, r'perl(\d\.\d\.\d)?')
+
 
 class LuaLexer(RegexLexer):
     name = 'Lua'
diff --git a/pygments/lexers/templates.py b/pygments/lexers/templates.py
index b8a14008..e9771af5 100644
--- a/pygments/lexers/templates.py
+++ b/pygments/lexers/templates.py
@@ -21,6 +21,7 @@
 from pygments.lexer import \
      Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, include, using
 from pygments.token import \
      Text, Comment, Operator, Keyword, Name, String, Number, Other
+from pygments.util import html_doctype_matches, looks_like_xml
 
 __all__ = ['HtmlPhpLexer', 'XmlPhpLexer', 'CssPhpLexer',
            'JavascriptPhpLexer', 'ErbLexer', 'RhtmlLexer',
@@ -110,6 +111,10 @@ class ErbLexer(Lexer):
             except IndexError:
                 return
 
+    def analyse_text(text):
+        if '<%' in text and '%>' in text:
+            return 0.4
+
 
 class SmartyLexer(RegexLexer):
     name = 'Smarty'
@@ -118,7 +123,7 @@ class SmartyLexer(RegexLexer):
     flags = re.MULTILINE | re.DOTALL
 
     tokens = {
-        # XXX: make marty delimiters customizable somehow
+        # XXX: make smarty delimiters customizable somehow
         'root': [
             (r'[^{]+', Other),
             (r'(\{)(\*.*?\*)(\})',
@@ -145,6 +150,18 @@ class SmartyLexer(RegexLexer):
         ]
     }
 
+    def analyse_text(text):
+        rv = 0.0
+        if re.search('\{if\s+.*?\}.*?\{/if\}', text):
+            rv += 0.15
+        if re.search('\{include\s+file=.*?\}', text):
+            rv += 0.15
+        if re.search('\{foreach\s+.*?\}.*?\{/foreach\}', text):
+            rv += 0.15
+        if re.search('\{\$.*?\}', text):
+            rv += 0.01
+        return rv
+
 
 class DjangoLexer(RegexLexer):
     name = 'django template'
@@ -188,6 +205,16 @@ class DjangoLexer(RegexLexer):
         ]
     }
 
+    def analyse_text(text):
+        rv = 0.0
+        if re.search(r'\{\%\s*(block|extends)', text) is not None:
+            rv += 0.4
+        if re.search(r'\{\%\s*if\s*.*?\%\}', text) is not None:
+            rv += 0.1
+        if re.search(r'\{\{.*?\}\}', text) is not None:
+            rv += 0.1
+        return rv
+
 
 class RhtmlLexer(DelegatingLexer):
     name = 'RHTML'
@@ -197,6 +224,13 @@ class RhtmlLexer(DelegatingLexer):
     def __init__(self, **options):
         super(RhtmlLexer, self).__init__(HtmlLexer, ErbLexer, **options)
 
+    def analyse_text(text):
+        rv = ErbLexer.analyse_text(text) - 0.01
+        if html_doctype_matches(text):
+            # one more than the XmlErbLexer returns
+            rv += 0.5
+        return rv
+
 
 class XmlErbLexer(DelegatingLexer):
     name = 'XML+Ruby'
@@ -205,6 +239,12 @@ class XmlErbLexer(DelegatingLexer):
     def __init__(self, **options):
         super(XmlErbLexer, self).__init__(XmlLexer, ErbLexer, **options)
 
+    def analyse_text(text):
+        rv = ErbLexer.analyse_text(text) - 0.01
+        if looks_like_xml(text):
+            rv += 0.4
+        return rv
+
 
 class CssErbLexer(DelegatingLexer):
     name = 'CSS+Ruby'
@@ -231,6 +271,12 @@ class HtmlPhpLexer(DelegatingLexer):
     def __init__(self, **options):
         super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options)
 
+    def analyse_text(text):
+        rv = PhpLexer.analyse_text(text) - 0.01
+        if html_doctype_matches(text):
+            rv += 0.5
+        return rv
+
 
 class XmlPhpLexer(DelegatingLexer):
     name = 'XML+PHP'
@@ -239,6 +285,12 @@ class XmlPhpLexer(DelegatingLexer):
     def __init__(self, **options):
         super(XmlPhpLexer, self).__init__(XmlLexer, PhpLexer, **options)
 
+    def analyse_text(text):
+        rv = PhpLexer.analyse_text(text) - 0.01
+        if looks_like_xml(text):
+            rv += 0.4
+        return rv
+
 
 class CssPhpLexer(DelegatingLexer):
     name = 'CSS+PHP'
@@ -264,6 +316,12 @@ class HtmlSmartyLexer(DelegatingLexer):
     def __init__(self, **options):
         super(HtmlSmartyLexer, self).__init__(HtmlLexer, SmartyLexer, **options)
 
+    def analyse_text(text):
+        rv = SmartyLexer.analyse_text(text) - 0.01
+        if html_doctype_matches(text):
+            rv += 0.5
+        return rv
+
 
 class XmlSmartyLexer(DelegatingLexer):
     name = 'XML+Smarty'
@@ -272,6 +330,12 @@ class XmlSmartyLexer(DelegatingLexer):
     def __init__(self, **options):
         super(XmlSmartyLexer, self).__init__(XmlLexer, SmartyLexer, **options)
 
+    def analyse_text(text):
+        rv = SmartyLexer.analyse_text(text) - 0.01
+        if looks_like_xml(text):
+            rv += 0.4
+        return rv
+
 
 class CssSmartyLexer(DelegatingLexer):
     name = 'CSS+Smarty'
@@ -297,6 +361,12 @@ class HtmlDjangoLexer(DelegatingLexer):
     def __init__(self, **options):
         super(HtmlDjangoLexer, self).__init__(HtmlLexer, DjangoLexer, **options)
 
+    def analyse_text(text):
+        rv = DjangoLexer.analyse_text(text) - 0.01
+        if html_doctype_matches(text):
+            rv += 0.5
+        return rv
+
 
 class XmlDjangoLexer(DelegatingLexer):
     name = 'XML+Django'
@@ -305,6 +375,12 @@ class XmlDjangoLexer(DelegatingLexer):
     def __init__(self, **options):
         super(XmlDjangoLexer, self).__init__(XmlLexer, DjangoLexer, **options)
 
+    def analyse_text(text):
+        rv = DjangoLexer.analyse_text(text) - 0.01
+        if looks_like_xml(text):
+            rv += 0.4
+        return rv
+
 
 class CssDjangoLexer(DelegatingLexer):
     name = 'CSS+Django'
diff --git a/pygments/lexers/web.py b/pygments/lexers/web.py
index 0b90e6cd..939144dd 100644
--- a/pygments/lexers/web.py
+++ b/pygments/lexers/web.py
@@ -18,7 +18,8 @@ except NameError:
 from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, using
 from pygments.token import \
      Text, Comment, Operator, Keyword, Name, String, Number, Other
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, looks_like_xml, \
+     html_doctype_matches
 
 __all__ = ['HtmlLexer', 'XmlLexer', 'JavascriptLexer', 'CssLexer',
@@ -217,6 +218,10 @@ class HtmlLexer(RegexLexer):
         ],
     }
 
+    def analyse_text(text):
+        if html_doctype_matches(text):
+            return 0.5
+
 
 class PhpLexer(RegexLexer):
     name = 'PHP'
@@ -296,6 +301,13 @@ class PhpLexer(RegexLexer):
                 continue
             yield index, token, value
 
+    def analyse_text(text):
+        rv = 0.0
+        for tag in '<?php', '?>':
+            if tag in text:
+                rv += 0.2
+        return rv
+
 
 class XmlLexer(RegexLexer):
     flags = re.MULTILINE | re.DOTALL
@@ -332,3 +344,7 @@ class XmlLexer(RegexLexer):
             (r'[^\s>]+', String, '#pop'),
         ],
     }
+
+    def analyse_text(text):
+        if looks_like_xml(text):
+            return 0.5
diff --git a/pygments/util.py b/pygments/util.py
index 4a427fb8..8ad994c6 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -9,6 +9,19 @@
     :copyright: 2006 by Georg Brandl.
     :license: GNU LGPL, see LICENSE for more details.
 """
+import re
+
+
+split_path_re = re.compile(r'[/\\ ]')
+doctype_lookup_re = re.compile(r'''(?smx)
+    (<\?.*?\?>)?\s*
+    <!DOCTYPE\s+(
+     [a-zA-Z_][a-zA-Z0-9]*\s+
+     [a-zA-Z_][a-zA-Z0-9]*\s+
+     "[^"]*")
+     [^>]+>
+''')
+tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</\1>(?uism)')
 
 
 class OptionError(Exception):
@@ -49,3 +62,92 @@ def get_list_opt(options, optname, default=None):
         raise OptionError('Invalid value %r for option %s; you '
                           'must give a list value' % val, optname)
+
+
+def make_analysator(f):
+    """
+    Return a static text analysation function that
+    returns float values.
+    """
+    def text_analyse(text):
+        rv = f(text)
+        if not rv:
+            return 0.0
+        return min(1.0, max(0.0, float(rv)))
+    text_analyse.__doc__ = f.__doc__
+    return staticmethod(text_analyse)
+
+
+def shebang_matches(text, regex):
+    """
+    Check if the given regular expression matches the last part of the
+    shebang if one exists.
+
+    >>> from pygments.util import shebang_matches
+    >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
+    True
+    >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
+    True
+    >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
+    False
+    >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
+    False
+    >>> shebang_matches('#!/usr/bin/startsomethingwith python',
+    ...                 r'python(2\.\d)?')
+    True
+
+    It also checks for common windows executable file extensions::
+
+        >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
+        True
+
+    Parameters (``'-f'`` or ``'--foo'`` are ignored so ``'perl'`` does
+    the same as ``'perl -e'``)
+
+    Note that this method automatically searches the whole string (eg:
+    the regular expression is wrapped in ``'^$'``)
+    """
+    if '\n' in text:
+        first_line = text[:text.index('\n')].lower()
+    else:
+        first_line = text.lower()
+    if first_line.startswith('#!'):
+        try:
+            found = [x for x in split_path_re.split(first_line[2:].strip())
+                     if x and not x.startswith('-')][-1]
+        except IndexError:
+            return False
+        regex = re.compile('^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
+        if regex.search(found) is not None:
+            return True
+    return False
+
+
+def doctype_matches(text, regex):
+    """
+    Check if the doctype matches a regular expression (if present).
+    Note that this method only checks the first part of a DOCTYPE.
+    eg: 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'
+    """
+    m = doctype_lookup_re.match(text)
+    if m is None:
+        return False
+    doctype = m.group(2)
+    return re.compile(regex).match(doctype.strip()) is not None
+
+
+def html_doctype_matches(text):
+    """
+    Check if the file looks like it has a html doctype
+    """
+    return doctype_matches(text, r'html\s+PUBLIC\s+"-//W3C//DTD X?HTML.*')
+
+
+def looks_like_xml(text):
+    """
+    Check if a doctype exists or if we have some tags
+    """
+    m = doctype_lookup_re.match(text)
+    if m is not None:
+        return True
+    return tag_re.search(text) is not None
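
Third-party lexers get the same hook: ``analyse_text`` is written as a plain
function in the class body (no ``self``), and `LexerMeta` (inherited through
`RegexLexerMeta`) runs it through `make_analysator` when the class is created.
A small sketch against this revision's API; `ExampleShellLexer`, its rules and
the sample input are hypothetical and not part of the commit::

    from pygments.lexer import RegexLexer
    from pygments.token import Comment, Keyword, Text
    from pygments.util import shebang_matches


    class ExampleShellLexer(RegexLexer):
        # hypothetical lexer, only here to demonstrate the analyse_text hook
        name = 'Shell (example)'
        aliases = ['example-shell']
        filenames = ['*.sh']

        tokens = {
            'root': [
                (r'#.*\n', Comment),
                (r'\b(if|then|else|fi|for|do|done)\b', Keyword),
                (r'.+\n?', Text),
            ],
        }

        def analyse_text(text):
            # no self/cls: the metaclass turns this into a static method and
            # make_analysator clamps whatever it returns into 0.0 .. 1.0
            return shebang_matches(text, r'(ba|k|z)?sh')


    # shebang_matches returns True; the wrapper converts that to 1.0, the
    # score guess_lexer takes as "use this lexer immediately" (provided the
    # lexer is registered as a builtin or plugin lexer).
    assert ExampleShellLexer.analyse_text('#!/bin/bash\necho hi\n') == 1.0
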
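The clamping itself lives in `make_analysator`; a short sketch of what the
wrapper does to raw return values (the `_Probe` class and its lambdas are
purely illustrative)::

    from pygments.util import make_analysator


    class _Probe(object):
        # attach the wrapped functions to a throwaway class so that the
        # staticmethod objects returned by make_analysator are callable
        overshoot = make_analysator(lambda text: 42)
        nothing = make_analysator(lambda text: None)


    assert _Probe.overshoot('any text') == 1.0  # clamped to the upper bound
    assert _Probe.nothing('any text') == 0.0    # falsy means "not my language"
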
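The HTML and XML scores above all reduce to the helpers at the bottom of
pygments/util.py; a sketch of how they react to hand-written snippets chosen
to satisfy the regular expressions in this revision::

    from pygments.util import html_doctype_matches, looks_like_xml

    xhtml = ('<?xml version="1.0" encoding="utf-8"?>\n'
             '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
             '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
             '<html></html>\n')

    # doctype_lookup_re allows an optional <?xml ...?> prolog before the
    # DOCTYPE, so this sample counts both as an HTML doctype and as XML.
    assert html_doctype_matches(xhtml)
    assert looks_like_xml(xhtml)

    # no doctype, but a matching open/close tag pair satisfies tag_re
    snippet = '<config><db>sqlite</db></config>'
    assert looks_like_xml(snippet)
    assert not html_doctype_matches(snippet)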