-rw-r--r--  docs/generate.py                92
-rw-r--r--  docs/src/lexerdevelopment.txt   31
-rw-r--r--  pygments/lexer.py               46
-rw-r--r--  pygments/lexers/__init__.py     31
-rw-r--r--  pygments/lexers/agile.py        11
-rw-r--r--  pygments/lexers/templates.py    78
-rw-r--r--  pygments/lexers/web.py          18
-rw-r--r--  pygments/util.py               102
8 files changed, 352 insertions, 57 deletions
diff --git a/docs/generate.py b/docs/generate.py
index abdc6316..8759042b 100644
--- a/docs/generate.py
+++ b/docs/generate.py
@@ -27,7 +27,7 @@ from pygments.lexers import get_lexer_by_name
from pygments.formatters import HtmlFormatter
-PYGMENTS_FORMATTER = HtmlFormatter(style='friendly', cssclass='syntax')
+PYGMENTS_FORMATTER = HtmlFormatter(style='pastie', cssclass='syntax')
USAGE = '''\
Usage: %s <mode> <destination> [<source.txt> ...]
@@ -91,15 +91,15 @@ body {
}
h1 {
- font-weight: normal;
- font-size: 40px;
- color: #09839A;
+ font-weight: normal;
+ font-size: 40px;
+ color: #09839A;
}
h2 {
- font-weight: normal;
- font-size: 30px;
- color: #C73F00;
+ font-weight: normal;
+ font-size: 30px;
+ color: #C73F00;
}
h1.heading {
@@ -111,87 +111,87 @@ h2.subheading {
}
h3 {
- margin-top: 30px;
+ margin-top: 30px;
}
table.docutils {
- border-collapse: collapse;
- border: 2px solid #aaa;
- margin: 0.5em 1.5em 0.5em 1.5em;
+ border-collapse: collapse;
+ border: 2px solid #aaa;
+ margin: 0.5em 1.5em 0.5em 1.5em;
}
table.docutils td {
- padding: 2px;
- border: 1px solid #ddd;
+ padding: 2px;
+ border: 1px solid #ddd;
}
p, li, dd, dt, blockquote {
- font-size: 15px;
- color: #333;
+ font-size: 15px;
+ color: #333;
}
p {
- line-height: 150%;
- margin-bottom: 0;
- margin-top: 10px;
+ line-height: 150%;
+ margin-bottom: 0;
+ margin-top: 10px;
}
hr {
- border-top: 1px solid #ccc;
- border-bottom: 0;
- border-right: 0;
- border-left: 0;
- margin-bottom: 10px;
- margin-top: 20px;
+ border-top: 1px solid #ccc;
+ border-bottom: 0;
+ border-right: 0;
+ border-left: 0;
+ margin-bottom: 10px;
+ margin-top: 20px;
}
dl {
- margin-left: 10px;
+ margin-left: 10px;
}
li, dt {
- margin-top: 5px;
+ margin-top: 5px;
}
dt {
- font-weight: bold;
+ font-weight: bold;
}
th {
- text-align: left;
+ text-align: left;
}
a {
- color: #990000;
+ color: #990000;
}
a:hover {
- color: #c73f00;
+ color: #c73f00;
}
pre {
- background-color: #f0f0f0;
- border-top: 1px solid #ccc;
- border-bottom: 1px solid #ccc;
- padding: 5px;
- font-size: 13px;
- font-family: Bitstream Vera Sans Mono,monospace;
+ background-color: #f9f9f9;
+ border-top: 1px solid #ccc;
+ border-bottom: 1px solid #ccc;
+ padding: 5px;
+ font-size: 13px;
+ font-family: Bitstream Vera Sans Mono,monospace;
}
tt {
- font-size: 13px;
- font-family: Bitstream Vera Sans Mono,monospace;
- color: black;
- padding: 1px 2px 1px 2px;
- background-color: #f0f0f0;
+ font-size: 13px;
+ font-family: Bitstream Vera Sans Mono,monospace;
+ color: black;
+ padding: 1px 2px 1px 2px;
+ background-color: #f0f0f0;
}
cite {
- /* abusing <cite>, it's generated by ReST for `x` */
- font-size: 13px;
- font-family: Bitstream Vera Sans Mono,monospace;
- font-weight: bold;
- font-style: normal;
+ /* abusing <cite>, it's generated by ReST for `x` */
+ font-size: 13px;
+ font-family: Bitstream Vera Sans Mono,monospace;
+ font-weight: bold;
+ font-style: normal;
}
#backlink {
diff --git a/docs/src/lexerdevelopment.txt b/docs/src/lexerdevelopment.txt
index c5d2921b..80822e1d 100644
--- a/docs/src/lexerdevelopment.txt
+++ b/docs/src/lexerdevelopment.txt
@@ -480,3 +480,34 @@ This might sound confusing (and it can really be). But it is needed, and for an
example look at the Ruby lexer in `agile.py`_.
.. _agile.py: http://trac.pocoo.org/repos/pygments/trunk/pygments/lexers/agile.py
+
+
+Filtering Token Streams
+=======================
+
+Some languages ship a lot of builtin functions (for example PHP). The total
+number of those functions differs from system to system because not everybody
+has every extension installed. In the case of PHP there are over 3000 builtin
+functions. That's an incredibly large number, far more than you could sensibly
+put into a regular expression.
+
+But because only `Name` tokens can be function names, this is solvable by
+overriding the ``get_tokens_unprocessed`` method. The following lexer
+subclasses the `PythonLexer` so that it highlights some additional names
+as pseudo keywords:
+
+.. sourcecode:: python
+
+    from pygments.lexers.agile import PythonLexer
+    from pygments.token import Name, Keyword
+
+ class MyPythonLexer(PythonLexer):
+ EXTRA_KEYWORDS = ['foo', 'bar', 'foobar', 'barfoo', 'spam', 'eggs']
+
+ def get_tokens_unprocessed(self, text):
+ for index, token, value in PythonLexer.get_tokens_unprocessed(self, text):
+ if token is Name and value in self.EXTRA_KEYWORDS:
+ yield index, Keyword.Pseudo, value
+ else:
+ yield index, token, value
+
+The `PhpLexer` and `LuaLexer` use this method to resolve builtin functions.
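
[Editor's note: with a subclass like the one above in place, the lexer is used
like any other. A minimal usage sketch; ``highlight`` and ``TerminalFormatter``
are standard Pygments API, and the sample code string is made up:]

.. sourcecode:: python

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    code = 'spam(foo)\neggs = barfoo + 1\n'
    # 'spam', 'foo', 'eggs' and 'barfoo' are emitted as Keyword.Pseudo
    # instead of plain Name tokens, so they get keyword styling.
    print(highlight(code, MyPythonLexer(), TerminalFormatter()))
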
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 6f57f31d..b5320ce2 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -1,23 +1,39 @@
# -*- coding: utf-8 -*-
"""
- pygments.lexer
- ~~~~~~~~~~~~~~
+pygments.lexer
+~~~~~~~~~~~~~~
- Base lexer classes.
+Base lexer classes.
- :copyright: 2006 by Georg Brandl.
- :license: GNU LGPL, see LICENSE for more details.
+:copyright: 2006 by Georg Brandl.
+:license: GNU LGPL, see LICENSE for more details.
"""
import re
+from types import FunctionType
from pygments.token import Error, Text, Other, _TokenType
-from pygments.util import get_bool_opt, get_int_opt
+from pygments.util import get_bool_opt, get_int_opt, make_analysator
__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']
+_default_analyse = staticmethod(lambda x: 0.0)
+
+
+class LexerMeta(type):
+ """
+ This metaclass automagically converts ``analyse_text`` methods into
+ static methods which always return float values.
+ """
+
+ def __new__(cls, name, bases, d):
+ if 'analyse_text' in d:
+ d['analyse_text'] = make_analysator(d['analyse_text'])
+ return type.__new__(cls, name, bases, d)
+
+
class Lexer(object):
"""
Lexer for a specific language.
@@ -41,12 +57,28 @@ class Lexer(object):
#: fn match rules
filenames = []
+ __metaclass__ = LexerMeta
+
def __init__(self, **options):
self.options = options
self.stripnl = get_bool_opt(options, 'stripnl', True)
self.stripall = get_bool_opt(options, 'stripall', False)
self.tabsize = get_int_opt(options, 'tabsize', 0)
+ def analyse_text(text):
+ """
+        Has to return a float between ``0`` and ``1`` that indicates
+        how well this lexer fits the given text; used by ``guess_lexer``.
+        If this method returns ``0`` the lexer won't highlight the text
+        at all; if it returns ``1``, highlighting with this lexer is
+        guaranteed.
+
+ The `LexerMeta` metaclass automatically wraps this function so
+ that it works like a static method (no ``self`` or ``cls``
+ parameter) and the return value is automatically converted to
+        `float`. If the return value is an object that is boolean `False`
+        it's the same as if the return value was ``0.0``.
+ """
+
def get_tokens(self, text):
"""
Return an iterable of (tokentype, value) pairs generated from ``text``.
@@ -216,7 +248,7 @@ def using(_other, **kwargs):
return callback
-class RegexLexerMeta(type):
+class RegexLexerMeta(LexerMeta):
"""
Metaclass for RegexLexer, creates the self._tokens attribute from
self.tokens on the first instantiation.
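
[Editor's note: in effect, ``analyse_text`` can now be written as a plain
function of ``text`` alone, and truthy or falsy return values are normalized
by ``make_analysator``. A minimal sketch of the resulting behavior;
``FooLexer`` and its single rule are hypothetical, for illustration only:]

.. sourcecode:: python

    from pygments.lexer import RegexLexer
    from pygments.token import Text

    class FooLexer(RegexLexer):
        name = 'Foo'  # hypothetical lexer
        tokens = {'root': [(r'.+\n?', Text)]}

        def analyse_text(text):
            # no ``self`` -- LexerMeta turns this into a static method
            return text.startswith('#!foo')  # bool, coerced to float

    print(FooLexer.analyse_text('#!foo\nbar\n'))  # 1.0
    print(FooLexer.analyse_text('bar\n'))         # 0.0 (falsy -> 0.0)
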
diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py
index 99ec4a79..0f7c0483 100644
--- a/pygments/lexers/__init__.py
+++ b/pygments/lexers/__init__.py
@@ -16,7 +16,8 @@ from pygments.lexers._mapping import LEXERS
from pygments.plugin import find_plugin_lexers
-__all__ = ['get_lexer_by_name', 'get_lexer_for_filename'] + LEXERS.keys()
+__all__ = ['get_lexer_by_name', 'get_lexer_for_filename',
+ 'guess_lexer'] + LEXERS.keys()
_lexer_cache = {}
@@ -68,6 +69,34 @@ def get_lexer_for_filename(fn, **options):
raise ValueError('no lexer for filename %r found' % fn)
+def guess_lexer(text, **options):
+ """
+    Guess a lexer by strong distinctions in the text (e.g. a shebang line).
+ """
+ best_lexer = [0.0, None]
+ # builtin lexers
+ for module_name, name, _, _ in LEXERS.itervalues():
+ if name not in _lexer_cache:
+ _load_lexers(module_name)
+ lexer = _lexer_cache[name]
+ rv = lexer.analyse_text(text)
+ if rv == 1.0:
+ return lexer(**options)
+ if rv > best_lexer[0]:
+ best_lexer[:] = (rv, lexer)
+ # plugin lexers
+ for lexer in find_plugin_lexers():
+ rv = lexer.analyse_text(text)
+ if rv == 1.0:
+ return lexer(**options)
+ if rv > best_lexer[0]:
+ best_lexer[:] = (rv, lexer)
+ if best_lexer[0] == 0.0 or best_lexer[1] is None:
+ from pygments.lexers.special import TextLexer
+ return TextLexer(**options)
+ return best_lexer[1](**options)
+
+
class _automodule(types.ModuleType):
def __getattr__(self, name):
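
[Editor's note: a short sketch of how the new function behaves, tracing the
code above; the shebang-based ``analyse_text`` for Python is added later in
this same commit:]

.. sourcecode:: python

    from pygments.lexers import guess_lexer

    lexer = guess_lexer('#!/usr/bin/env python\nprint "Hello World"\n')
    print(lexer.name)  # 'Python' -- the shebang scores 1.0, so the
                       # loop short-circuits without trying the rest

    lexer = guess_lexer('nothing recognizable in here')
    print(lexer.name)  # 'Text only' -- fell back to TextLexer
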
diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py
index 61292824..aec32450 100644
--- a/pygments/lexers/agile.py
+++ b/pygments/lexers/agile.py
@@ -19,7 +19,7 @@ from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, \
LexerContext, include, combined, do_insertions, bygroups
from pygments.token import Error, Text, \
Comment, Operator, Keyword, Name, String, Number, Generic
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, shebang_matches
__all__ = ['PythonLexer', 'PythonConsoleLexer', 'RubyLexer',
@@ -145,6 +145,9 @@ class PythonLexer(RegexLexer):
],
}
+ def analyse_text(text):
+ return shebang_matches(text, r'pythonw?(2\.\d)?')
+
class PythonConsoleLexer(Lexer):
"""
@@ -504,6 +507,9 @@ class RubyLexer(ExtendedRegexLexer):
}
tokens.update(gen_rubystrings_rules())
+ def analyse_text(text):
+ return shebang_matches(text, r'ruby(1\.\d)?')
+
class RubyConsoleLexer(Lexer):
"""
@@ -675,6 +681,9 @@ class PerlLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ return shebang_matches(text, r'perl(\d\.\d\.\d)?')
+
class LuaLexer(RegexLexer):
name = 'Lua'
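
[Editor's note: with these additions the interpreter lexers can be scored
directly. A small sketch tracing the new methods; the values follow from
``shebang_matches`` returning ``True``/``False``, which ``make_analysator``
coerces to ``1.0``/``0.0``:]

.. sourcecode:: python

    from pygments.lexers.agile import PythonLexer, RubyLexer

    print(PythonLexer.analyse_text('#!/usr/bin/python2.4\nprint 42\n'))  # 1.0
    print(RubyLexer.analyse_text('#!/usr/bin/env ruby\nputs 42\n'))      # 1.0
    print(PythonLexer.analyse_text('print 42\n'))  # 0.0 -- no shebang line
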
diff --git a/pygments/lexers/templates.py b/pygments/lexers/templates.py
index b8a14008..e9771af5 100644
--- a/pygments/lexers/templates.py
+++ b/pygments/lexers/templates.py
@@ -21,6 +21,7 @@ from pygments.lexer import \
Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, include, using
from pygments.token import \
Text, Comment, Operator, Keyword, Name, String, Number, Other
+from pygments.util import html_doctype_matches, looks_like_xml
__all__ = ['HtmlPhpLexer', 'XmlPhpLexer', 'CssPhpLexer',
'JavascriptPhpLexer', 'ErbLexer', 'RhtmlLexer',
@@ -110,6 +111,10 @@ class ErbLexer(Lexer):
except IndexError:
return
+ def analyse_text(text):
+ if '<%' in text and '%>' in text:
+ return 0.4
+
class SmartyLexer(RegexLexer):
name = 'Smarty'
@@ -118,7 +123,7 @@ class SmartyLexer(RegexLexer):
flags = re.MULTILINE | re.DOTALL
tokens = {
- # XXX: make marty delimiters customizable somehow
+ # XXX: make smarty delimiters customizable somehow
'root': [
(r'[^{]+', Other),
(r'(\{)(\*.*?\*)(\})',
@@ -145,6 +150,18 @@ class SmartyLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ rv = 0.0
+        if re.search(r'\{if\s+.*?\}.*?\{/if\}', text):
+            rv += 0.15
+        if re.search(r'\{include\s+file=.*?\}', text):
+            rv += 0.15
+        if re.search(r'\{foreach\s+.*?\}.*?\{/foreach\}', text):
+            rv += 0.15
+        if re.search(r'\{\$.*?\}', text):
+            rv += 0.01
+ return rv
+
class DjangoLexer(RegexLexer):
name = 'django template'
@@ -188,6 +205,16 @@ class DjangoLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ rv = 0.0
+ if re.search(r'\{\%\s*(block|extends)', text) is not None:
+ rv += 0.4
+ if re.search(r'\{\%\s*if\s*.*?\%\}', text) is not None:
+ rv += 0.1
+ if re.search(r'\{\{.*?\}\}', text) is not None:
+ rv += 0.1
+ return rv
+
class RhtmlLexer(DelegatingLexer):
name = 'RHTML'
@@ -197,6 +224,13 @@ class RhtmlLexer(DelegatingLexer):
def __init__(self, **options):
super(RhtmlLexer, self).__init__(HtmlLexer, ErbLexer, **options)
+ def analyse_text(text):
+ rv = ErbLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+            # 0.1 more than the XmlErbLexer adds
+ rv += 0.5
+ return rv
+
class XmlErbLexer(DelegatingLexer):
name = 'XML+Ruby'
@@ -205,6 +239,12 @@ class XmlErbLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlErbLexer, self).__init__(XmlLexer, ErbLexer, **options)
+ def analyse_text(text):
+ rv = ErbLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssErbLexer(DelegatingLexer):
name = 'CSS+Ruby'
@@ -231,6 +271,12 @@ class HtmlPhpLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options)
+ def analyse_text(text):
+ rv = PhpLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlPhpLexer(DelegatingLexer):
name = 'XML+PHP'
@@ -239,6 +285,12 @@ class XmlPhpLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlPhpLexer, self).__init__(XmlLexer, PhpLexer, **options)
+ def analyse_text(text):
+ rv = PhpLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssPhpLexer(DelegatingLexer):
name = 'CSS+PHP'
@@ -264,6 +316,12 @@ class HtmlSmartyLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlSmartyLexer, self).__init__(HtmlLexer, SmartyLexer, **options)
+ def analyse_text(text):
+ rv = SmartyLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlSmartyLexer(DelegatingLexer):
name = 'XML+Smarty'
@@ -272,6 +330,12 @@ class XmlSmartyLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlSmartyLexer, self).__init__(XmlLexer, SmartyLexer, **options)
+ def analyse_text(text):
+ rv = SmartyLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssSmartyLexer(DelegatingLexer):
name = 'CSS+Smarty'
@@ -297,6 +361,12 @@ class HtmlDjangoLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlDjangoLexer, self).__init__(HtmlLexer, DjangoLexer, **options)
+ def analyse_text(text):
+ rv = DjangoLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlDjangoLexer(DelegatingLexer):
name = 'XML+Django'
@@ -305,6 +375,12 @@ class XmlDjangoLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlDjangoLexer, self).__init__(XmlLexer, DjangoLexer, **options)
+ def analyse_text(text):
+ rv = DjangoLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssDjangoLexer(DelegatingLexer):
name = 'CSS+Django'
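
[Editor's note: the pattern throughout is the same: each markup+template
combination starts from the embedded language's score, subtracts ``0.01`` so
the plain language wins ties, then adds a doctype or XML bonus so the right
combination wins on real documents. Tracing the arithmetic for a small
XHTML+PHP document, with scores as defined by this patch:]

.. sourcecode:: python

    from pygments.lexers import PhpLexer, HtmlPhpLexer, XmlPhpLexer

    doc = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
           '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
           '<html><body><?php echo "hi"; ?></body></html>\n')

    print(PhpLexer.analyse_text(doc))      # 0.4   ('<?php' and '?>' present)
    print(HtmlPhpLexer.analyse_text(doc))  # ~0.89 (0.4 - 0.01 + 0.5, doctype)
    print(XmlPhpLexer.analyse_text(doc))   # ~0.79 (0.4 - 0.01 + 0.4, XML-ish)
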
diff --git a/pygments/lexers/web.py b/pygments/lexers/web.py
index 0b90e6cd..939144dd 100644
--- a/pygments/lexers/web.py
+++ b/pygments/lexers/web.py
@@ -18,7 +18,8 @@ except NameError:
from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, using
from pygments.token import \
Text, Comment, Operator, Keyword, Name, String, Number, Other
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, looks_like_xml, \
+ html_doctype_matches
__all__ = ['HtmlLexer', 'XmlLexer', 'JavascriptLexer', 'CssLexer',
@@ -217,6 +218,10 @@ class HtmlLexer(RegexLexer):
],
}
+ def analyse_text(text):
+ if html_doctype_matches(text):
+ return 0.5
+
class PhpLexer(RegexLexer):
name = 'PHP'
@@ -296,6 +301,13 @@ class PhpLexer(RegexLexer):
continue
yield index, token, value
+ def analyse_text(text):
+ rv = 0.0
+ for tag in '<?php', '?>':
+ if tag in text:
+ rv += 0.2
+ return rv
+
class XmlLexer(RegexLexer):
flags = re.MULTILINE | re.DOTALL
@@ -332,3 +344,7 @@ class XmlLexer(RegexLexer):
(r'[^\s>]+', String, '#pop'),
],
}
+
+ def analyse_text(text):
+ if looks_like_xml(text):
+ return 0.5
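
[Editor's note: these two give ``guess_lexer`` a baseline for plain markup.
A quick sketch of what the new methods return, again tracing this patch:]

.. sourcecode:: python

    from pygments.lexers.web import HtmlLexer, XmlLexer

    print(HtmlLexer.analyse_text('<html><body>hi</body></html>'))
    # 0.0 -- no doctype, so analyse_text returns None -> coerced to 0.0
    print(XmlLexer.analyse_text('<?xml version="1.0"?>\n<a>x</a>'))
    # 0.5 -- looks_like_xml finds a matching open/close tag pair
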
diff --git a/pygments/util.py b/pygments/util.py
index 4a427fb8..8ad994c6 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -9,6 +9,19 @@
:copyright: 2006 by Georg Brandl.
:license: GNU LGPL, see LICENSE for more details.
"""
+import re
+
+
+split_path_re = re.compile(r'[/\\ ]')
+doctype_lookup_re = re.compile(r'''(?smx)
+ (<\?.*?\?>)?\s*
+ <!DOCTYPE\s+(
+ [a-zA-Z_][a-zA-Z0-9]*\s+
+ [a-zA-Z_][a-zA-Z0-9]*\s+
+ "[^"]*")
+ [^>]+>
+''')
+tag_re = re.compile(r'(?uism)<(.+?)(\s.*?)?>.*?</\1>')
class OptionError(Exception):
@@ -49,3 +62,92 @@ def get_list_opt(options, optname, default=None):
raise OptionError('Invalid value %r for option %s; you '
'must give a list value' %
(val, optname))
+
+
+def make_analysator(f):
+ """
+    Return a static text analysis function that
+    returns float values.
+ """
+ def text_analyse(text):
+ rv = f(text)
+ if not rv:
+ return 0.0
+ return min(1.0, max(0.0, float(rv)))
+ text_analyse.__doc__ = f.__doc__
+ return staticmethod(text_analyse)
+
+
+def shebang_matches(text, regex):
+ """
+ Check if the given regular expression matches the last part of the
+ shebang if one exists.
+
+ >>> from pygments.util import shebang_matches
+ >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
+ True
+ >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
+ True
+ >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
+ False
+ >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
+ False
+ >>> shebang_matches('#!/usr/bin/startsomethingwith python',
+ ... r'python(2\.\d)?')
+ True
+
+    It also checks for common Windows executable file extensions::
+
+ >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
+ True
+
+    Parameters such as ``'-f'`` or ``'--foo'`` are ignored, so ``'perl'``
+    matches the same shebangs as ``'perl -e'``.
+
+    Note that this method matches against the whole interpreter name
+    (i.e. the regular expression is wrapped in ``'^$'``).
+ """
+ if '\n' in text:
+ first_line = text[:text.index('\n')].lower()
+ else:
+ first_line = text.lower()
+ if first_line.startswith('#!'):
+ try:
+ found = [x for x in split_path_re.split(first_line[2:].strip())
+ if x and not x.startswith('-')][-1]
+ except IndexError:
+ return False
+        regex = re.compile(r'^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
+ if regex.search(found) is not None:
+ return True
+ return False
+
+
+def doctype_matches(text, regex):
+ """
+    Check if the doctype matches a regular expression (if present).
+    Note that this method only checks the first part of a DOCTYPE,
+    e.g. ``'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'``.
+ """
+ m = doctype_lookup_re.match(text)
+ if m is None:
+ return False
+ doctype = m.group(2)
+ return re.compile(regex).match(doctype.strip()) is not None
+
+
+def html_doctype_matches(text):
+ """
+    Check if the file looks like it has an HTML doctype.
+ """
+ return doctype_matches(text, r'html\s+PUBLIC\s+"-//W3C//DTD X?HTML.*')
+
+
+def looks_like_xml(text):
+ """
+    Check if a doctype exists or if the text contains some tags.
+ """
+ m = doctype_lookup_re.match(text)
+ if m is not None:
+ return True
+ return tag_re.search(text) is not None
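
[Editor's note: putting the new helpers together. A small sketch of the
expected results; the XHTML snippet is made up, and the values trace the
code above:]

.. sourcecode:: python

    from pygments.util import (doctype_matches, html_doctype_matches,
                               looks_like_xml, shebang_matches)

    xhtml = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
             '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n')

    print(html_doctype_matches(xhtml))                # True
    print(doctype_matches(xhtml, r'svg\s+PUBLIC.*'))  # False, it's 'html ...'
    print(looks_like_xml('<greeting>hi</greeting>'))  # True, via tag_re
    print(looks_like_xml('just plain text'))          # False
    print(shebang_matches('#!/bin/sh -e', r'sh'))     # True, '-e' is ignored
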