author    blackbird <devnull@localhost>  2006-10-27 23:59:07 +0200
committer blackbird <devnull@localhost>  2006-10-27 23:59:07 +0200
commit    0b4ae9ab3fa6057dce2833a3e34ba01511c10e44 (patch)
tree      f4afa530b3b9aae10144448d0aaa25bc990fb482 /pygments
parent    a400243228ed76501b820f2a6d0e7f924d5f9882 (diff)
download  pygments-0b4ae9ab3fa6057dce2833a3e34ba01511c10e44.tar.gz
[svn] checked in changes from the last days, including:
- text in logo
- documentation update
- new `guess_lexer` method
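
The headline change is the new `guess_lexer` entry point. A minimal usage sketch against the API added in this commit (the expected lexer name assumes the Python shebang check in agile.py fires):

    from pygments.lexers import guess_lexer

    # A Python shebang scores 1.0 via shebang_matches(), so guess_lexer
    # short-circuits and returns a PythonLexer instance.
    lexer = guess_lexer('#!/usr/bin/env python\nprint "hello"')
    print lexer.name   # expected: 'Python'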
Diffstat (limited to 'pygments')
-rw-r--r--  pygments/lexer.py              46
-rw-r--r--  pygments/lexers/__init__.py    31
-rw-r--r--  pygments/lexers/agile.py       11
-rw-r--r--  pygments/lexers/templates.py   78
-rw-r--r--  pygments/lexers/web.py         18
-rw-r--r--  pygments/util.py              102
6 files changed, 275 insertions, 11 deletions
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 6f57f31d..b5320ce2 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -1,23 +1,39 @@
# -*- coding: utf-8 -*-
"""
- pygments.lexer
- ~~~~~~~~~~~~~~
+pygments.lexer
+~~~~~~~~~~~~~~
- Base lexer classes.
+Base lexer classes.
- :copyright: 2006 by Georg Brandl.
- :license: GNU LGPL, see LICENSE for more details.
+:copyright: 2006 by Georg Brandl.
+:license: GNU LGPL, see LICENSE for more details.
"""
import re
+from types import FunctionType
from pygments.token import Error, Text, Other, _TokenType
-from pygments.util import get_bool_opt, get_int_opt
+from pygments.util import get_bool_opt, get_int_opt, make_analysator
__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']
+_default_analyse = staticmethod(lambda x: 0.0)
+
+
+class LexerMeta(type):
+ """
+ This metaclass automagically converts ``analyse_text`` methods into
+ static methods which always return float values.
+ """
+
+ def __new__(cls, name, bases, d):
+ if 'analyse_text' in d:
+ d['analyse_text'] = make_analysator(d['analyse_text'])
+ return type.__new__(cls, name, bases, d)
+
+
class Lexer(object):
"""
Lexer for a specific language.
@@ -41,12 +57,28 @@ class Lexer(object):
#: fn match rules
filenames = []
+ __metaclass__ = LexerMeta
+
def __init__(self, **options):
self.options = options
self.stripnl = get_bool_opt(options, 'stripnl', True)
self.stripall = get_bool_opt(options, 'stripall', False)
self.tabsize = get_int_opt(options, 'tabsize', 0)
+ def analyse_text(text):
+ """
+ Has to return a float between ``0`` and ``1`` that indicates
+ if a lexer wants to highlight this text. Used by ``guess_lexer``.
+ If this method returns ``0`` it won't highlight it at all, if
+ it returns ``1`` highlighting with this lexer is guaranteed.
+
+ The `LexerMeta` metaclass automatically wraps this function so
+ that it works like a static method (no ``self`` or ``cls``
+ parameter) and the return value is automatically converted to
+ `float`. If the return value is an object that is boolean `False`
+ it's the same as if the return value was ``0.0``.
+ """
+
def get_tokens(self, text):
"""
Return an iterable of (tokentype, value) pairs generated from ``text``.
@@ -216,7 +248,7 @@ def using(_other, **kwargs):
return callback
-class RegexLexerMeta(type):
+class RegexLexerMeta(LexerMeta):
"""
Metaclass for RegexLexer, creates the self._tokens attribute from
self.tokens on the first instantiation.
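
The `analyse_text` protocol above is worth spelling out: a subclass defines it as a plain function taking only `text`, and `LexerMeta` rewrites it through `make_analysator` (see util.py below) into a static method whose result is coerced to a float in [0.0, 1.0]. A hypothetical subclass as a sketch (`FooLexer` and its marker are invented for illustration):

    from pygments.lexer import Lexer

    class FooLexer(Lexer):
        name = 'Foo'

        # LexerMeta turns this into a staticmethod; a falsy return
        # value (None here) is treated as 0.0 by make_analysator.
        def analyse_text(text):
            if text.startswith('#!foo'):   # invented marker
                return 0.8

    print FooLexer.analyse_text('#!foo\n')   # 0.8
    print FooLexer.analyse_text('bar')       # 0.0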
diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py
index 99ec4a79..0f7c0483 100644
--- a/pygments/lexers/__init__.py
+++ b/pygments/lexers/__init__.py
@@ -16,7 +16,8 @@ from pygments.lexers._mapping import LEXERS
from pygments.plugin import find_plugin_lexers
-__all__ = ['get_lexer_by_name', 'get_lexer_for_filename'] + LEXERS.keys()
+__all__ = ['get_lexer_by_name', 'get_lexer_for_filename',
+ 'guess_lexer'] + LEXERS.keys()
_lexer_cache = {}
@@ -68,6 +69,34 @@ def get_lexer_for_filename(fn, **options):
raise ValueError('no lexer for filename %r found' % fn)
+def guess_lexer(text, **options):
+ """
+ Guess a lexer by strong distinctions in the text (eg, shebang).
+ """
+ best_lexer = [0.0, None]
+ # builtin lexers
+ for module_name, name, _, _ in LEXERS.itervalues():
+ if name not in _lexer_cache:
+ _load_lexers(module_name)
+ lexer = _lexer_cache[name]
+ rv = lexer.analyse_text(text)
+ if rv == 1.0:
+ return lexer(**options)
+ if rv > best_lexer[0]:
+ best_lexer[:] = (rv, lexer)
+ # plugin lexers
+ for lexer in find_plugin_lexers():
+ rv = lexer.analyse_text(text)
+ if rv == 1.0:
+ return lexer(**options)
+ if rv > best_lexer[0]:
+ best_lexer[:] = (rv, lexer)
+ if best_lexer[0] == 0.0 or best_lexer[1] is None:
+ from pygments.lexers.special import TextLexer
+ return TextLexer(**options)
+ return best_lexer[1](**options)
+
+
class _automodule(types.ModuleType):
def __getattr__(self, name):
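
If no lexer reaches ``1.0``, the best nonzero score wins; if nothing scores at all, `guess_lexer` falls back to `TextLexer`. A sketch of the fallback path (the printed name assumes `TextLexer.name` is ``'Text only'``, as in pygments.lexers.special):

    from pygments.lexers import guess_lexer

    # Plain prose triggers no analyse_text heuristic, so the
    # TextLexer fallback at the end of guess_lexer kicks in.
    print guess_lexer('just a few plain sentences.').name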
diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py
index 61292824..aec32450 100644
--- a/pygments/lexers/agile.py
+++ b/pygments/lexers/agile.py
@@ -19,7 +19,7 @@ from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, \
LexerContext, include, combined, do_insertions, bygroups
from pygments.token import Error, Text, \
Comment, Operator, Keyword, Name, String, Number, Generic
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, shebang_matches
__all__ = ['PythonLexer', 'PythonConsoleLexer', 'RubyLexer',
@@ -145,6 +145,9 @@ class PythonLexer(RegexLexer):
],
}
+ def analyse_text(text):
+ return shebang_matches(text, r'pythonw?(2\.\d)?')
+
class PythonConsoleLexer(Lexer):
"""
@@ -504,6 +507,9 @@ class RubyLexer(ExtendedRegexLexer):
}
tokens.update(gen_rubystrings_rules())
+ def analyse_text(text):
+ return shebang_matches(text, r'ruby(1\.\d)?')
+
class RubyConsoleLexer(Lexer):
"""
@@ -675,6 +681,9 @@ class PerlLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ return shebang_matches(text, r'perl(\d\.\d\.\d)?')
+
class LuaLexer(RegexLexer):
name = 'Lua'
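
All three agile-language checks delegate to the new `shebang_matches` helper, which returns True or False; `make_analysator` then maps that to ``1.0`` or ``0.0``. A quick sketch of what the regexes above accept (interpreter paths invented for illustration):

    from pygments.util import shebang_matches

    # The last non-option path component is matched against the regex.
    print shebang_matches('#!/usr/local/bin/python2.4 -O',
                          r'pythonw?(2\.\d)?')                     # True
    print shebang_matches('#!/usr/bin/env ruby', r'ruby(1\.\d)?')  # True
    print shebang_matches('#!/bin/sh', r'perl(\d\.\d\.\d)?')       # False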
diff --git a/pygments/lexers/templates.py b/pygments/lexers/templates.py
index b8a14008..e9771af5 100644
--- a/pygments/lexers/templates.py
+++ b/pygments/lexers/templates.py
@@ -21,6 +21,7 @@ from pygments.lexer import \
Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, include, using
from pygments.token import \
Text, Comment, Operator, Keyword, Name, String, Number, Other
+from pygments.util import html_doctype_matches, looks_like_xml
__all__ = ['HtmlPhpLexer', 'XmlPhpLexer', 'CssPhpLexer',
'JavascriptPhpLexer', 'ErbLexer', 'RhtmlLexer',
@@ -110,6 +111,10 @@ class ErbLexer(Lexer):
except IndexError:
return
+ def analyse_text(text):
+ if '<%' in text and '%>' in text:
+ return 0.4
+
class SmartyLexer(RegexLexer):
name = 'Smarty'
@@ -118,7 +123,7 @@ class SmartyLexer(RegexLexer):
flags = re.MULTILINE | re.DOTALL
tokens = {
- # XXX: make marty delimiters customizable somehow
+ # XXX: make smarty delimiters customizable somehow
'root': [
(r'[^{]+', Other),
(r'(\{)(\*.*?\*)(\})',
@@ -145,6 +150,18 @@ class SmartyLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ rv = 0.0
+ if re.search('\{if\s+.*?\}.*?\{/if\}', text):
+ rv += 0.15
+ if re.search('\{include\s+file=.*?\}', text):
+ rv += 0.15
+ if re.search('\{foreach\s+.*?\}.*?\{/foreach\}', text):
+ rv += 0.15
+ if re.search('\{\$.*?\}', text):
+ rv += 0.01
+ return rv
+
class DjangoLexer(RegexLexer):
name = 'django template'
@@ -188,6 +205,16 @@ class DjangoLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ rv = 0.0
+ if re.search(r'\{\%\s*(block|extends)', text) is not None:
+ rv += 0.4
+ if re.search(r'\{\%\s*if\s*.*?\%\}', text) is not None:
+ rv += 0.1
+ if re.search(r'\{\{.*?\}\}', text) is not None:
+ rv += 0.1
+ return rv
+
class RhtmlLexer(DelegatingLexer):
name = 'RHTML'
@@ -197,6 +224,13 @@ class RhtmlLexer(DelegatingLexer):
def __init__(self, **options):
super(RhtmlLexer, self).__init__(HtmlLexer, ErbLexer, **options)
+ def analyse_text(text):
+ rv = ErbLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ # one more than the XmlErbLexer returns
+ rv += 0.5
+ return rv
+
class XmlErbLexer(DelegatingLexer):
name = 'XML+Ruby'
@@ -205,6 +239,12 @@ class XmlErbLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlErbLexer, self).__init__(XmlLexer, ErbLexer, **options)
+ def analyse_text(text):
+ rv = ErbLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssErbLexer(DelegatingLexer):
name = 'CSS+Ruby'
@@ -231,6 +271,12 @@ class HtmlPhpLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options)
+ def analyse_text(text):
+ rv = PhpLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlPhpLexer(DelegatingLexer):
name = 'XML+PHP'
@@ -239,6 +285,12 @@ class XmlPhpLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlPhpLexer, self).__init__(XmlLexer, PhpLexer, **options)
+ def analyse_text(text):
+ rv = PhpLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssPhpLexer(DelegatingLexer):
name = 'CSS+PHP'
@@ -264,6 +316,12 @@ class HtmlSmartyLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlSmartyLexer, self).__init__(HtmlLexer, SmartyLexer, **options)
+ def analyse_text(text):
+ rv = SmartyLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlSmartyLexer(DelegatingLexer):
name = 'XML+Smarty'
@@ -272,6 +330,12 @@ class XmlSmartyLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlSmartyLexer, self).__init__(XmlLexer, SmartyLexer, **options)
+ def analyse_text(text):
+ rv = SmartyLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssSmartyLexer(DelegatingLexer):
name = 'CSS+Smarty'
@@ -297,6 +361,12 @@ class HtmlDjangoLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlDjangoLexer, self).__init__(HtmlLexer, DjangoLexer, **options)
+ def analyse_text(text):
+ rv = DjangoLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlDjangoLexer(DelegatingLexer):
name = 'XML+Django'
@@ -305,6 +375,12 @@ class XmlDjangoLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlDjangoLexer, self).__init__(XmlLexer, DjangoLexer, **options)
+ def analyse_text(text):
+ rv = DjangoLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssDjangoLexer(DelegatingLexer):
name = 'CSS+Django'
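
All the delegating lexers above share one pattern: start from the embedded language's score minus ``0.01`` (so the plain lexer wins exact ties), then add ``0.5`` for an HTML doctype or ``0.4`` for XML-ish input. The additive template heuristics can also be probed directly; a sketch against `DjangoLexer` (template text invented for illustration):

    from pygments.lexers.templates import DjangoLexer

    tmpl = '{% extends "base.html" %}\n{% if user %}{{ user.name }}{% endif %}'
    # extends tag (+0.4) + if tag (+0.1) + variable (+0.1) => 0.6
    print DjangoLexer.analyse_text(tmpl)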
diff --git a/pygments/lexers/web.py b/pygments/lexers/web.py
index 0b90e6cd..939144dd 100644
--- a/pygments/lexers/web.py
+++ b/pygments/lexers/web.py
@@ -18,7 +18,8 @@ except NameError:
from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, using
from pygments.token import \
Text, Comment, Operator, Keyword, Name, String, Number, Other
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, looks_like_xml, \
+ html_doctype_matches
__all__ = ['HtmlLexer', 'XmlLexer', 'JavascriptLexer', 'CssLexer',
@@ -217,6 +218,10 @@ class HtmlLexer(RegexLexer):
],
}
+ def analyse_text(text):
+ if html_doctype_matches(text):
+ return 0.5
+
class PhpLexer(RegexLexer):
name = 'PHP'
@@ -296,6 +301,13 @@ class PhpLexer(RegexLexer):
continue
yield index, token, value
+ def analyse_text(text):
+ rv = 0.0
+ for tag in '<?php', '?>':
+ if tag in text:
+ rv += 0.2
+ return rv
+
class XmlLexer(RegexLexer):
flags = re.MULTILINE | re.DOTALL
@@ -332,3 +344,7 @@ class XmlLexer(RegexLexer):
(r'[^\s>]+', String, '#pop'),
],
}
+
+ def analyse_text(text):
+ if looks_like_xml(text):
+ return 0.5
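
Both web checks ride on the two new helpers added to util.py below. A sketch of what they accept (documents shortened for illustration):

    from pygments.util import html_doctype_matches, looks_like_xml

    doc = '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "x">'
    print html_doctype_matches(doc)                 # True
    print looks_like_xml('<root><a>x</a></root>')   # True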
diff --git a/pygments/util.py b/pygments/util.py
index 4a427fb8..8ad994c6 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -9,6 +9,19 @@
:copyright: 2006 by Georg Brandl.
:license: GNU LGPL, see LICENSE for more details.
"""
+import re
+
+
+split_path_re = re.compile(r'[/\\ ]')
+doctype_lookup_re = re.compile(r'''(?smx)
+ (<\?.*?\?>)?\s*
+ <!DOCTYPE\s+(
+ [a-zA-Z_][a-zA-Z0-9]*\s+
+ [a-zA-Z_][a-zA-Z0-9]*\s+
+ "[^"]*")
+ [^>]+>
+''')
+tag_re = re.compile(r'<(.+?)(\s.*?)?>.*?</\1>(?uism)')
class OptionError(Exception):
@@ -49,3 +62,92 @@ def get_list_opt(options, optname, default=None):
raise OptionError('Invalid value %r for option %s; you '
'must give a list value' % (
val, optname))
+
+
+def make_analysator(f):
+ """
+ Return a static text analysation function that
+ returns float values.
+ """
+ def text_analyse(text):
+ rv = f(text)
+ if not rv:
+ return 0.0
+ return min(1.0, max(0.0, float(rv)))
+ text_analyse.__doc__ = f.__doc__
+ return staticmethod(text_analyse)
+
+
+def shebang_matches(text, regex):
+ """
+ Check if the given regular expression matches the last part of the
+ shebang if one exists.
+
+ >>> from pygments.util import shebang_matches
+ >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
+ True
+ >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
+ True
+ >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
+ False
+ >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
+ False
+ >>> shebang_matches('#!/usr/bin/startsomethingwith python',
+ ... r'python(2\.\d)?')
+ True
+
+ It also checks for common windows executable file extensions::
+
+ >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
+ True
+
+ Parameters in the shebang (``'-f'`` or ``'--foo'``) are ignored,
+ so ``'perl'`` matches the same as ``'perl -e'``.
+
+ Note that this method matches against the whole interpreter name
+ (i.e. the regular expression is wrapped in ``'^'`` and ``'$'``).
+ """
+ if '\n' in text:
+ first_line = text[:text.index('\n')].lower()
+ else:
+ first_line = text.lower()
+ if first_line.startswith('#!'):
+ try:
+ found = [x for x in split_path_re.split(first_line[2:].strip())
+ if x and not x.startswith('-')][-1]
+ except IndexError:
+ return False
+ regex = re.compile('^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
+ if regex.search(found) is not None:
+ return True
+ return False
+
+
+def doctype_matches(text, regex):
+ """
+ Check if the doctype matches a regular expression (if present).
+ Note that this method only checks the first part of a DOCTYPE,
+ e.g. 'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'.
+ """
+ m = doctype_lookup_re.match(text)
+ if m is None:
+ return False
+ doctype = m.group(2)
+ return re.compile(regex).match(doctype.strip()) is not None
+
+
+def html_doctype_matches(text):
+ """
+ Check if the file looks like it has an HTML doctype.
+ """
+ return doctype_matches(text, r'html\s+PUBLIC\s+"-//W3C//DTD X?HTML.*')
+
+
+def looks_like_xml(text):
+ """
+ Check if a doctype exists or if we have some tags
+ """
+ m = doctype_lookup_re.match(text)
+ if m is not None:
+ return True
+ return tag_re.search(text) is not None
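
`make_analysator` is what keeps the loose return conventions above safe: a falsy result becomes ``0.0``, anything else is coerced to float and clamped into [0.0, 1.0]. Since it returns a `staticmethod` object, the wrapped function has to live on a class to be callable (which is exactly what `LexerMeta` arranges); a minimal sketch:

    from pygments.util import make_analysator

    class Dummy(object):
        # scores get coerced to float and clamped into [0.0, 1.0];
        # falsy returns (None, 0, '') are normalized to 0.0
        analyse_text = make_analysator(lambda text: len(text))

    print Dummy.analyse_text('')      # 0.0 (falsy result)
    print Dummy.analyse_text('abc')   # 1.0 (3 clamped down)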