-rw-r--r--  docs/generate.py                92
-rw-r--r--  docs/src/lexerdevelopment.txt   31
-rw-r--r--  pygments/lexer.py               46
-rw-r--r--  pygments/lexers/__init__.py     31
-rw-r--r--  pygments/lexers/agile.py        11
-rw-r--r--  pygments/lexers/templates.py    78
-rw-r--r--  pygments/lexers/web.py          18
-rw-r--r--  pygments/util.py               102
8 files changed, 352 insertions, 57 deletions
diff --git a/docs/generate.py b/docs/generate.py
index abdc6316..8759042b 100644
--- a/docs/generate.py
+++ b/docs/generate.py
@@ -27,7 +27,7 @@ from pygments.lexers import get_lexer_by_name
from pygments.formatters import HtmlFormatter
-PYGMENTS_FORMATTER = HtmlFormatter(style='friendly', cssclass='syntax')
+PYGMENTS_FORMATTER = HtmlFormatter(style='pastie', cssclass='syntax')
USAGE = '''\
Usage: %s <mode> <destination> [<source.txt> ...]
@@ -91,15 +91,15 @@ body {
}
h1 {
- font-weight: normal;
- font-size: 40px;
- color: #09839A;
+ font-weight: normal;
+ font-size: 40px;
+ color: #09839A;
}
h2 {
- font-weight: normal;
- font-size: 30px;
- color: #C73F00;
+ font-weight: normal;
+ font-size: 30px;
+ color: #C73F00;
}
h1.heading {
@@ -111,87 +111,87 @@ h2.subheading {
}
h3 {
- margin-top: 30px;
+ margin-top: 30px;
}
table.docutils {
- border-collapse: collapse;
- border: 2px solid #aaa;
- margin: 0.5em 1.5em 0.5em 1.5em;
+ border-collapse: collapse;
+ border: 2px solid #aaa;
+ margin: 0.5em 1.5em 0.5em 1.5em;
}
table.docutils td {
- padding: 2px;
- border: 1px solid #ddd;
+ padding: 2px;
+ border: 1px solid #ddd;
}
p, li, dd, dt, blockquote {
- font-size: 15px;
- color: #333;
+ font-size: 15px;
+ color: #333;
}
p {
- line-height: 150%;
- margin-bottom: 0;
- margin-top: 10px;
+ line-height: 150%;
+ margin-bottom: 0;
+ margin-top: 10px;
}
hr {
- border-top: 1px solid #ccc;
- border-bottom: 0;
- border-right: 0;
- border-left: 0;
- margin-bottom: 10px;
- margin-top: 20px;
+ border-top: 1px solid #ccc;
+ border-bottom: 0;
+ border-right: 0;
+ border-left: 0;
+ margin-bottom: 10px;
+ margin-top: 20px;
}
dl {
- margin-left: 10px;
+ margin-left: 10px;
}
li, dt {
- margin-top: 5px;
+ margin-top: 5px;
}
dt {
- font-weight: bold;
+ font-weight: bold;
}
th {
- text-align: left;
+ text-align: left;
}
a {
- color: #990000;
+ color: #990000;
}
a:hover {
- color: #c73f00;
+ color: #c73f00;
}
pre {
- background-color: #f0f0f0;
- border-top: 1px solid #ccc;
- border-bottom: 1px solid #ccc;
- padding: 5px;
- font-size: 13px;
- font-family: Bitstream Vera Sans Mono,monospace;
+ background-color: #f9f9f9;
+ border-top: 1px solid #ccc;
+ border-bottom: 1px solid #ccc;
+ padding: 5px;
+ font-size: 13px;
+ font-family: Bitstream Vera Sans Mono,monospace;
}
tt {
- font-size: 13px;
- font-family: Bitstream Vera Sans Mono,monospace;
- color: black;
- padding: 1px 2px 1px 2px;
- background-color: #f0f0f0;
+ font-size: 13px;
+ font-family: Bitstream Vera Sans Mono,monospace;
+ color: black;
+ padding: 1px 2px 1px 2px;
+ background-color: #f0f0f0;
}
cite {
- /* abusing <cite>, it's generated by ReST for `x` */
- font-size: 13px;
- font-family: Bitstream Vera Sans Mono,monospace;
- font-weight: bold;
- font-style: normal;
+ /* abusing <cite>, it's generated by ReST for `x` */
+ font-size: 13px;
+ font-family: Bitstream Vera Sans Mono,monospace;
+ font-weight: bold;
+ font-style: normal;
}
#backlink {
diff --git a/docs/src/lexerdevelopment.txt b/docs/src/lexerdevelopment.txt
index c5d2921b..80822e1d 100644
--- a/docs/src/lexerdevelopment.txt
+++ b/docs/src/lexerdevelopment.txt
@@ -480,3 +480,34 @@ This might sound confusing (and it can really be). But it is needed, and for an
example look at the Ruby lexer in `agile.py`_.
.. _agile.py: http://trac.pocoo.org/repos/pygments/trunk/pygments/lexers/agile.py
+
+
+Filtering Token Streams
+=======================
+
+Some languages ship a lot of builtin functions (for example PHP). The total
+number of those functions differs from system to system because not everybody
+has every extension installed. In the case of PHP there are over 3000 builtin
+functions. That's an incredibly large number, far more than you could sensibly
+put into a regular expression.
+
+But because only `Name` tokens can be function names, this is solvable by
+overriding the ``get_tokens_unprocessed`` method. The following lexer
+subclasses the `PythonLexer` so that it highlights some additional names
+as pseudo keywords:
+
+.. sourcecode:: python
+
+    from pygments.lexers.agile import PythonLexer
+    from pygments.token import Name, Keyword
+
+ class MyPythonLexer(PythonLexer):
+ EXTRA_KEYWORDS = ['foo', 'bar', 'foobar', 'barfoo', 'spam', 'eggs']
+
+ def get_tokens_unprocessed(self, text):
+ for index, token, value in PythonLexer.get_tokens_unprocessed(self, text):
+ if token is Name and value in self.EXTRA_KEYWORDS:
+ yield index, Keyword.Pseudo, value
+ else:
+ yield index, token, value
+
+The `PhpLexer` and `LuaLexer` use this method to resolve builtin functions.
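
[Editor's note: with a subclass like the one above in place, the lexer is used
like any other. A minimal usage sketch; ``highlight`` and ``TerminalFormatter``
are standard Pygments API, and the sample code string is made up:]

.. sourcecode:: python

    from pygments import highlight
    from pygments.formatters import TerminalFormatter

    code = 'spam(foo)\neggs = barfoo + 1\n'
    # 'spam', 'foo', 'eggs' and 'barfoo' are emitted as Keyword.Pseudo
    # instead of plain Name tokens, so they get keyword styling.
    print(highlight(code, MyPythonLexer(), TerminalFormatter()))
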
diff --git a/pygments/lexer.py b/pygments/lexer.py
index 6f57f31d..b5320ce2 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -1,23 +1,39 @@
# -*- coding: utf-8 -*-
"""
- pygments.lexer
- ~~~~~~~~~~~~~~
+pygments.lexer
+~~~~~~~~~~~~~~
- Base lexer classes.
+Base lexer classes.
- :copyright: 2006 by Georg Brandl.
- :license: GNU LGPL, see LICENSE for more details.
+:copyright: 2006 by Georg Brandl.
+:license: GNU LGPL, see LICENSE for more details.
"""
import re
+from types import FunctionType
from pygments.token import Error, Text, Other, _TokenType
-from pygments.util import get_bool_opt, get_int_opt
+from pygments.util import get_bool_opt, get_int_opt, make_analysator
__all__ = ['Lexer', 'RegexLexer', 'ExtendedRegexLexer', 'DelegatingLexer',
'LexerContext', 'include', 'flags', 'bygroups', 'using', 'this']
+_default_analyse = staticmethod(lambda x: 0.0)
+
+
+class LexerMeta(type):
+ """
+ This metaclass automagically converts ``analyse_text`` methods into
+ static methods which always return float values.
+ """
+
+ def __new__(cls, name, bases, d):
+ if 'analyse_text' in d:
+ d['analyse_text'] = make_analysator(d['analyse_text'])
+ return type.__new__(cls, name, bases, d)
+
+
class Lexer(object):
"""
Lexer for a specific language.
@@ -41,12 +57,28 @@ class Lexer(object):
#: fn match rules
filenames = []
+ __metaclass__ = LexerMeta
+
def __init__(self, **options):
self.options = options
self.stripnl = get_bool_opt(options, 'stripnl', True)
self.stripall = get_bool_opt(options, 'stripall', False)
self.tabsize = get_int_opt(options, 'tabsize', 0)
+ def analyse_text(text):
+ """
+        Has to return a float between ``0`` and ``1`` that indicates
+        how well this lexer fits the given text; used by ``guess_lexer``.
+        If this method returns ``0`` the lexer won't highlight the text
+        at all; if it returns ``1``, highlighting with this lexer is
+        guaranteed.
+
+ The `LexerMeta` metaclass automatically wraps this function so
+ that it works like a static method (no ``self`` or ``cls``
+ parameter) and the return value is automatically converted to
+        `float`. If the return value is an object that is boolean `False`
+        it's the same as if the return value was ``0.0``.
+ """
+
def get_tokens(self, text):
"""
Return an iterable of (tokentype, value) pairs generated from ``text``.
@@ -216,7 +248,7 @@ def using(_other, **kwargs):
return callback
-class RegexLexerMeta(type):
+class RegexLexerMeta(LexerMeta):
"""
Metaclass for RegexLexer, creates the self._tokens attribute from
self.tokens on the first instantiation.
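
[Editor's note: in effect, ``analyse_text`` can now be written as a plain
function of ``text`` alone, and truthy or falsy return values are normalized
by ``make_analysator``. A minimal sketch of the resulting behavior;
``FooLexer`` and its single rule are hypothetical, for illustration only:]

.. sourcecode:: python

    from pygments.lexer import RegexLexer
    from pygments.token import Text

    class FooLexer(RegexLexer):
        name = 'Foo'  # hypothetical lexer
        tokens = {'root': [(r'.+\n?', Text)]}

        def analyse_text(text):
            # no ``self`` -- LexerMeta turns this into a static method
            return text.startswith('#!foo')  # bool, coerced to float

    print(FooLexer.analyse_text('#!foo\nbar\n'))  # 1.0
    print(FooLexer.analyse_text('bar\n'))         # 0.0 (falsy -> 0.0)
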
diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py
index 99ec4a79..0f7c0483 100644
--- a/pygments/lexers/__init__.py
+++ b/pygments/lexers/__init__.py
@@ -16,7 +16,8 @@ from pygments.lexers._mapping import LEXERS
from pygments.plugin import find_plugin_lexers
-__all__ = ['get_lexer_by_name', 'get_lexer_for_filename'] + LEXERS.keys()
+__all__ = ['get_lexer_by_name', 'get_lexer_for_filename',
+ 'guess_lexer'] + LEXERS.keys()
_lexer_cache = {}
@@ -68,6 +69,34 @@ def get_lexer_for_filename(fn, **options):
raise ValueError('no lexer for filename %r found' % fn)
+def guess_lexer(text, **options):
+ """
+    Guess a lexer by strong distinctions in the text (e.g. a shebang line).
+ """
+ best_lexer = [0.0, None]
+ # builtin lexers
+ for module_name, name, _, _ in LEXERS.itervalues():
+ if name not in _lexer_cache:
+ _load_lexers(module_name)
+ lexer = _lexer_cache[name]
+ rv = lexer.analyse_text(text)
+ if rv == 1.0:
+ return lexer(**options)
+ if rv > best_lexer[0]:
+ best_lexer[:] = (rv, lexer)
+ # plugin lexers
+ for lexer in find_plugin_lexers():
+ rv = lexer.analyse_text(text)
+ if rv == 1.0:
+ return lexer(**options)
+ if rv > best_lexer[0]:
+ best_lexer[:] = (rv, lexer)
+ if best_lexer[0] == 0.0 or best_lexer[1] is None:
+ from pygments.lexers.special import TextLexer
+ return TextLexer(**options)
+ return best_lexer[1](**options)
+
+
class _automodule(types.ModuleType):
def __getattr__(self, name):
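
[Editor's note: a short sketch of how the new function behaves, tracing the
code above; the shebang-based ``analyse_text`` for Python is added later in
this same commit:]

.. sourcecode:: python

    from pygments.lexers import guess_lexer

    lexer = guess_lexer('#!/usr/bin/env python\nprint "Hello World"\n')
    print(lexer.name)  # 'Python' -- the shebang scores 1.0, so the
                       # loop short-circuits without trying the rest

    lexer = guess_lexer('nothing recognizable in here')
    print(lexer.name)  # 'Text only' -- fell back to TextLexer
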
diff --git a/pygments/lexers/agile.py b/pygments/lexers/agile.py
index 61292824..aec32450 100644
--- a/pygments/lexers/agile.py
+++ b/pygments/lexers/agile.py
@@ -19,7 +19,7 @@ from pygments.lexer import Lexer, RegexLexer, ExtendedRegexLexer, \
LexerContext, include, combined, do_insertions, bygroups
from pygments.token import Error, Text, \
Comment, Operator, Keyword, Name, String, Number, Generic
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, shebang_matches
__all__ = ['PythonLexer', 'PythonConsoleLexer', 'RubyLexer',
@@ -145,6 +145,9 @@ class PythonLexer(RegexLexer):
],
}
+ def analyse_text(text):
+ return shebang_matches(text, r'pythonw?(2\.\d)?')
+
class PythonConsoleLexer(Lexer):
"""
@@ -504,6 +507,9 @@ class RubyLexer(ExtendedRegexLexer):
}
tokens.update(gen_rubystrings_rules())
+ def analyse_text(text):
+ return shebang_matches(text, r'ruby(1\.\d)?')
+
class RubyConsoleLexer(Lexer):
"""
@@ -675,6 +681,9 @@ class PerlLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ return shebang_matches(text, r'perl(\d\.\d\.\d)?')
+
class LuaLexer(RegexLexer):
name = 'Lua'
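
[Editor's note: with these additions the interpreter lexers can be scored
directly. A small sketch tracing the new methods; the values follow from
``shebang_matches`` returning ``True``/``False``, which ``make_analysator``
coerces to ``1.0``/``0.0``:]

.. sourcecode:: python

    from pygments.lexers.agile import PythonLexer, RubyLexer

    print(PythonLexer.analyse_text('#!/usr/bin/python2.4\nprint 42\n'))  # 1.0
    print(RubyLexer.analyse_text('#!/usr/bin/env ruby\nputs 42\n'))      # 1.0
    print(PythonLexer.analyse_text('print 42\n'))  # 0.0 -- no shebang line
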
diff --git a/pygments/lexers/templates.py b/pygments/lexers/templates.py
index b8a14008..e9771af5 100644
--- a/pygments/lexers/templates.py
+++ b/pygments/lexers/templates.py
@@ -21,6 +21,7 @@ from pygments.lexer import \
Lexer, DelegatingLexer, RegexLexer, do_insertions, bygroups, include, using
from pygments.token import \
Text, Comment, Operator, Keyword, Name, String, Number, Other
+from pygments.util import html_doctype_matches, looks_like_xml
__all__ = ['HtmlPhpLexer', 'XmlPhpLexer', 'CssPhpLexer',
'JavascriptPhpLexer', 'ErbLexer', 'RhtmlLexer',
@@ -110,6 +111,10 @@ class ErbLexer(Lexer):
except IndexError:
return
+ def analyse_text(text):
+ if '<%' in text and '%>' in text:
+ return 0.4
+
class SmartyLexer(RegexLexer):
name = 'Smarty'
@@ -118,7 +123,7 @@ class SmartyLexer(RegexLexer):
flags = re.MULTILINE | re.DOTALL
tokens = {
- # XXX: make marty delimiters customizable somehow
+ # XXX: make smarty delimiters customizable somehow
'root': [
(r'[^{]+', Other),
(r'(\{)(\*.*?\*)(\})',
@@ -145,6 +150,18 @@ class SmartyLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ rv = 0.0
+        if re.search(r'\{if\s+.*?\}.*?\{/if\}', text):
+            rv += 0.15
+        if re.search(r'\{include\s+file=.*?\}', text):
+            rv += 0.15
+        if re.search(r'\{foreach\s+.*?\}.*?\{/foreach\}', text):
+            rv += 0.15
+        if re.search(r'\{\$.*?\}', text):
+            rv += 0.01
+ return rv
+
class DjangoLexer(RegexLexer):
name = 'django template'
@@ -188,6 +205,16 @@ class DjangoLexer(RegexLexer):
]
}
+ def analyse_text(text):
+ rv = 0.0
+ if re.search(r'\{\%\s*(block|extends)', text) is not None:
+ rv += 0.4
+ if re.search(r'\{\%\s*if\s*.*?\%\}', text) is not None:
+ rv += 0.1
+ if re.search(r'\{\{.*?\}\}', text) is not None:
+ rv += 0.1
+ return rv
+
class RhtmlLexer(DelegatingLexer):
name = 'RHTML'
@@ -197,6 +224,13 @@ class RhtmlLexer(DelegatingLexer):
def __init__(self, **options):
super(RhtmlLexer, self).__init__(HtmlLexer, ErbLexer, **options)
+ def analyse_text(text):
+ rv = ErbLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+            # 0.1 more than the XmlErbLexer adds
+ rv += 0.5
+ return rv
+
class XmlErbLexer(DelegatingLexer):
name = 'XML+Ruby'
@@ -205,6 +239,12 @@ class XmlErbLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlErbLexer, self).__init__(XmlLexer, ErbLexer, **options)
+ def analyse_text(text):
+ rv = ErbLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssErbLexer(DelegatingLexer):
name = 'CSS+Ruby'
@@ -231,6 +271,12 @@ class HtmlPhpLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlPhpLexer, self).__init__(HtmlLexer, PhpLexer, **options)
+ def analyse_text(text):
+ rv = PhpLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlPhpLexer(DelegatingLexer):
name = 'XML+PHP'
@@ -239,6 +285,12 @@ class XmlPhpLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlPhpLexer, self).__init__(XmlLexer, PhpLexer, **options)
+ def analyse_text(text):
+ rv = PhpLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssPhpLexer(DelegatingLexer):
name = 'CSS+PHP'
@@ -264,6 +316,12 @@ class HtmlSmartyLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlSmartyLexer, self).__init__(HtmlLexer, SmartyLexer, **options)
+ def analyse_text(text):
+ rv = SmartyLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlSmartyLexer(DelegatingLexer):
name = 'XML+Smarty'
@@ -272,6 +330,12 @@ class XmlSmartyLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlSmartyLexer, self).__init__(XmlLexer, SmartyLexer, **options)
+ def analyse_text(text):
+ rv = SmartyLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssSmartyLexer(DelegatingLexer):
name = 'CSS+Smarty'
@@ -297,6 +361,12 @@ class HtmlDjangoLexer(DelegatingLexer):
def __init__(self, **options):
super(HtmlDjangoLexer, self).__init__(HtmlLexer, DjangoLexer, **options)
+ def analyse_text(text):
+ rv = DjangoLexer.analyse_text(text) - 0.01
+ if html_doctype_matches(text):
+ rv += 0.5
+ return rv
+
class XmlDjangoLexer(DelegatingLexer):
name = 'XML+Django'
@@ -305,6 +375,12 @@ class XmlDjangoLexer(DelegatingLexer):
def __init__(self, **options):
super(XmlDjangoLexer, self).__init__(XmlLexer, DjangoLexer, **options)
+ def analyse_text(text):
+ rv = DjangoLexer.analyse_text(text) - 0.01
+ if looks_like_xml(text):
+ rv += 0.4
+ return rv
+
class CssDjangoLexer(DelegatingLexer):
name = 'CSS+Django'
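
[Editor's note: the pattern throughout is the same: each markup+template
combination starts from the embedded language's score, subtracts ``0.01`` so
the plain language wins ties, then adds a doctype or XML bonus so the right
combination wins on real documents. Tracing the arithmetic for a small
XHTML+PHP document, with scores as defined by this patch:]

.. sourcecode:: python

    from pygments.lexers import PhpLexer, HtmlPhpLexer, XmlPhpLexer

    doc = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
           '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n'
           '<html><body><?php echo "hi"; ?></body></html>\n')

    print(PhpLexer.analyse_text(doc))      # 0.4   ('<?php' and '?>' present)
    print(HtmlPhpLexer.analyse_text(doc))  # ~0.89 (0.4 - 0.01 + 0.5, doctype)
    print(XmlPhpLexer.analyse_text(doc))   # ~0.79 (0.4 - 0.01 + 0.4, XML-ish)
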
diff --git a/pygments/lexers/web.py b/pygments/lexers/web.py
index 0b90e6cd..939144dd 100644
--- a/pygments/lexers/web.py
+++ b/pygments/lexers/web.py
@@ -18,7 +18,8 @@ except NameError:
from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, using
from pygments.token import \
Text, Comment, Operator, Keyword, Name, String, Number, Other
-from pygments.util import get_bool_opt, get_list_opt
+from pygments.util import get_bool_opt, get_list_opt, looks_like_xml, \
+ html_doctype_matches
__all__ = ['HtmlLexer', 'XmlLexer', 'JavascriptLexer', 'CssLexer',
@@ -217,6 +218,10 @@ class HtmlLexer(RegexLexer):
],
}
+ def analyse_text(text):
+ if html_doctype_matches(text):
+ return 0.5
+
class PhpLexer(RegexLexer):
name = 'PHP'
@@ -296,6 +301,13 @@ class PhpLexer(RegexLexer):
continue
yield index, token, value
+ def analyse_text(text):
+ rv = 0.0
+ for tag in '<?php', '?>':
+ if tag in text:
+ rv += 0.2
+ return rv
+
class XmlLexer(RegexLexer):
flags = re.MULTILINE | re.DOTALL
@@ -332,3 +344,7 @@ class XmlLexer(RegexLexer):
(r'[^\s>]+', String, '#pop'),
],
}
+
+ def analyse_text(text):
+ if looks_like_xml(text):
+ return 0.5
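
[Editor's note: these two give ``guess_lexer`` a baseline for plain markup.
A quick sketch of what the new methods return, again tracing this patch:]

.. sourcecode:: python

    from pygments.lexers.web import HtmlLexer, XmlLexer

    print(HtmlLexer.analyse_text('<html><body>hi</body></html>'))
    # 0.0 -- no doctype, so analyse_text returns None -> coerced to 0.0
    print(XmlLexer.analyse_text('<?xml version="1.0"?>\n<a>x</a>'))
    # 0.5 -- looks_like_xml finds a matching open/close tag pair
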
diff --git a/pygments/util.py b/pygments/util.py
index 4a427fb8..8ad994c6 100644
--- a/pygments/util.py
+++ b/pygments/util.py
@@ -9,6 +9,19 @@
:copyright: 2006 by Georg Brandl.
:license: GNU LGPL, see LICENSE for more details.
"""
+import re
+
+
+split_path_re = re.compile(r'[/\\ ]')
+doctype_lookup_re = re.compile(r'''(?smx)
+ (<\?.*?\?>)?\s*
+ <!DOCTYPE\s+(
+ [a-zA-Z_][a-zA-Z0-9]*\s+
+ [a-zA-Z_][a-zA-Z0-9]*\s+
+ "[^"]*")
+ [^>]+>
+''')
+tag_re = re.compile(r'(?uism)<(.+?)(\s.*?)?>.*?</\1>')
class OptionError(Exception):
@@ -49,3 +62,92 @@ def get_list_opt(options, optname, default=None):
raise OptionError('Invalid value %r for option %s; you '
'must give a list value' %
(val, optname))
+
+
+def make_analysator(f):
+ """
+    Return a static text analysis function that
+    returns float values.
+ """
+ def text_analyse(text):
+ rv = f(text)
+ if not rv:
+ return 0.0
+ return min(1.0, max(0.0, float(rv)))
+ text_analyse.__doc__ = f.__doc__
+ return staticmethod(text_analyse)
+
+
+def shebang_matches(text, regex):
+ """
+ Check if the given regular expression matches the last part of the
+ shebang if one exists.
+
+ >>> from pygments.util import shebang_matches
+ >>> shebang_matches('#!/usr/bin/env python', r'python(2\.\d)?')
+ True
+ >>> shebang_matches('#!/usr/bin/python2.4', r'python(2\.\d)?')
+ True
+ >>> shebang_matches('#!/usr/bin/python-ruby', r'python(2\.\d)?')
+ False
+ >>> shebang_matches('#!/usr/bin/python/ruby', r'python(2\.\d)?')
+ False
+ >>> shebang_matches('#!/usr/bin/startsomethingwith python',
+ ... r'python(2\.\d)?')
+ True
+
+    It also checks for common Windows executable file extensions::
+
+ >>> shebang_matches('#!C:\\Python2.4\\Python.exe', r'python(2\.\d)?')
+ True
+
+    Parameters such as ``'-f'`` or ``'--foo'`` are ignored, so ``'perl'``
+    matches the same shebangs as ``'perl -e'``.
+
+    Note that this method matches against the whole interpreter name
+    (i.e. the regular expression is wrapped in ``'^$'``).
+ """
+ if '\n' in text:
+ first_line = text[:text.index('\n')].lower()
+ else:
+ first_line = text.lower()
+ if first_line.startswith('#!'):
+ try:
+ found = [x for x in split_path_re.split(first_line[2:].strip())
+ if x and not x.startswith('-')][-1]
+ except IndexError:
+ return False
+        regex = re.compile(r'^%s(\.(exe|cmd|bat|bin))?$' % regex, re.IGNORECASE)
+ if regex.search(found) is not None:
+ return True
+ return False
+
+
+def doctype_matches(text, regex):
+ """
+    Check if the doctype matches a regular expression (if present).
+    Note that this method only checks the first part of a DOCTYPE,
+    e.g. ``'html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"'``.
+ """
+ m = doctype_lookup_re.match(text)
+ if m is None:
+ return False
+ doctype = m.group(2)
+ return re.compile(regex).match(doctype.strip()) is not None
+
+
+def html_doctype_matches(text):
+ """
+    Check if the file looks like it has an HTML doctype.
+ """
+ return doctype_matches(text, r'html\s+PUBLIC\s+"-//W3C//DTD X?HTML.*')
+
+
+def looks_like_xml(text):
+ """
+    Check if a doctype exists or if the text contains some tags.
+ """
+ m = doctype_lookup_re.match(text)
+ if m is not None:
+ return True
+ return tag_re.search(text) is not None
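
[Editor's note: putting the new helpers together. A small sketch of the
expected results; the XHTML snippet is made up, and the values trace the
code above:]

.. sourcecode:: python

    from pygments.util import (doctype_matches, html_doctype_matches,
                               looks_like_xml, shebang_matches)

    xhtml = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"\n'
             '  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">\n')

    print(html_doctype_matches(xhtml))                # True
    print(doctype_matches(xhtml, r'svg\s+PUBLIC.*'))  # False, it's 'html ...'
    print(looks_like_xml('<greeting>hi</greeting>'))  # True, via tag_re
    print(looks_like_xml('just plain text'))          # False
    print(shebang_matches('#!/bin/sh -e', r'sh'))     # True, '-e' is ignored
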