-rw-r--r--  TODO                         14
-rw-r--r--  docs/src/api.txt             42
-rw-r--r--  docs/src/quickstart.txt      53
-rw-r--r--  docs/src/tokens.txt           3
-rw-r--r--  pygments/lexer.py             7
-rw-r--r--  pygments/lexers/__init__.py  30
-rw-r--r--  pygments/token.py             9
7 files changed, 126 insertions(+), 32 deletions(-)
diff --git a/TODO b/TODO
index 13337bb8..6c565025 100644
--- a/TODO
+++ b/TODO
@@ -4,22 +4,15 @@ Todo
before 0.5
----------
-- add mimetype attributes
- improve guess_lexer heuristics (esp. for template langs)
- more unit tests
-- documentation for new features (guessing)
-
- goto label HL support for languages that use it
-- tell the PHP and DelphiLexer how to differ between Operators and
- text.
-
for 0.6
-------
-- allow multiple token types per regex (done, but awkwardly)
- allow "overlay" token types (e.g. Diff + X)
- highlight specials: nth line, a word etc.
- dhtml: overlays toggleable by javascript
@@ -43,6 +36,9 @@ for 0.6
* tcl
* (la)tex
+- tell the PHP and DelphiLexer how to distinguish between Operators and
+  text.
+
- add a `Punctuation` token type for symbols that are neither text
  nor operators (blocks in ruby etc)
@@ -54,8 +50,8 @@ for 0.6
- docstrings?
-for 0.7
--------
+for 0.7 / later
+---------------
- moin parser
diff --git a/docs/src/api.txt b/docs/src/api.txt
index 90317147..1d32c59f 100644
--- a/docs/src/api.txt
+++ b/docs/src/api.txt
@@ -43,6 +43,25 @@ def `get_lexer_for_filename(fn, **options):`
Will raise `ValueError` if no lexer for that filename is found.
+def `get_lexer_for_mimetype(mime, **options):`
+    Return a `Lexer` subclass instance that has `mime` in its `mimetypes`
+    list. The lexer is given the `options` at its instantiation.
+
+    Will raise `ValueError` if no lexer for that mimetype is found.
+
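+    A short interactive example; ``text/x-python`` is the mimetype the
+    Python lexer registers, and the error message is the one raised by
+    this function:
+
+    .. sourcecode:: pycon
+
+        >>> from pygments.lexers import get_lexer_for_mimetype
+        >>> get_lexer_for_mimetype('text/x-python')
+        <pygments.lexers.PythonLexer>
+        >>> get_lexer_for_mimetype('text/x-made-up')
+        Traceback (most recent call last):
+          ...
+        ValueError: no lexer for mimetype 'text/x-made-up' found
+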
+def `guess_lexer(text, **options):`
+    Return a `Lexer` subclass instance that's guessed from the text
+    in `text`. For that, the `analyse_text()` method of every known
+    lexer class is called with the text as argument, and the lexer
+    which returns the highest value is instantiated and returned.
+
+ `ValueError` is raised if no lexer thinks it can handle the content.
+
+def `guess_lexer_for_filename(text, filename, **options):`
+ As `guess_lexer()`, but only lexers which have a pattern in `filenames`
+ or `alias_filenames` that matches `filename` are taken into consideration.
+
+ `ValueError` is raised if no lexer thinks it can handle the content.
Functions from `pygments.formatters`:
@@ -101,6 +120,12 @@ def `get_tokens_unprocessed(self, text):`
This method must be overridden by subclasses.
+def `analyse_text(text):`
+    A static method which is called for lexer guessing. It should analyze
+    the text and return a float in the range from ``0.0`` to ``1.0``.
+    If it returns ``0.0``, the lexer will not be selected as the most
+    probable one; if it returns ``1.0``, it will be selected immediately.
+
For a list of known tokens, have a look at the `Tokens`_ page.
The lexer also recognizes the following attributes that are used by the
@@ -114,8 +139,21 @@ builtin lookup mechanism.
the lexer from a list.
`filenames`
- A list of `fnmatch` patterns that can be used to find a lexer for
- a given filename.
+ A list of `fnmatch` patterns that match filenames which contain
+ content for this lexer. The patterns in this list should be unique among
+ all lexers.
+
+`alias_filenames`
+ A list of `fnmatch` patterns that match filenames which may or may not
+ contain content for this lexer. This list is used by the
+    `guess_lexer_for_filename()` function to determine which lexers are
+    considered when guessing the correct one. That means that e.g. every
+    lexer for HTML and a template language should include ``*.html`` in
+    this list.
+
+`mimetypes`
+ A list of MIME types for content that can be lexed with this
+ lexer.
.. _Tokens: tokens.txt
diff --git a/docs/src/quickstart.txt b/docs/src/quickstart.txt
index 5b8cdfaf..0d9a62bc 100644
--- a/docs/src/quickstart.txt
+++ b/docs/src/quickstart.txt
@@ -87,17 +87,58 @@ one of the following methods:
.. sourcecode:: pycon
- >>> from pygments.lexers import get_lexer_by_name, get_lexer_for_filename
+ >>> from pygments.lexers import (get_lexer_by_name,
+ ... get_lexer_for_filename, get_lexer_for_mimetype)
+
>>> get_lexer_by_name('python')
- <pygments.lexers.agile.PythonLexer object at 0xb7bd6d0c>
- >>> get_lexer_for_filename('spam.py')
- <pygments.lexers.agile.PythonLexer object at 0xb7bd6b2c>
+ <pygments.lexers.PythonLexer>
+
+ >>> get_lexer_for_filename('spam.rb')
+ <pygments.lexers.RubyLexer>
+
+ >>> get_lexer_for_mimetype('text/x-perl')
+ <pygments.lexers.PerlLexer>
+
+All these functions accept keyword arguments; they will be passed to the lexer
+as options.
-The same API is available for formatters: use `get_formatter_by_name` and
-`get_formatter_for_filename` from the `pygments.formatters` module
+A similar API is available for formatters: use `get_formatter_by_name()` and
+`get_formatter_for_filename()` from the `pygments.formatters` module
for this purpose.
+Guessing lexers
+===============
+
+If you don't know what language the content is written in, or you want to
+highlight a file whose extension is ambiguous, such as ``.html`` (which could
+contain plain HTML or some template tags), use these functions:
+
+.. sourcecode:: pycon
+
+ >>> from pygments.lexers import guess_lexer, guess_lexer_for_filename
+
+ >>> guess_lexer('#!/usr/bin/python\nprint "Hello World!"')
+ <pygments.lexers.PythonLexer>
+
+ >>> guess_lexer_for_filename('test.py', 'print "Hello World!"')
+ <pygments.lexers.PythonLexer>
+
+`guess_lexer()` passes the given content to each lexer class's `analyse_text()`
+method and returns an instance of the class that reports the highest value.
+
+All lexers have two different filename pattern lists: the primary and the
+secondary one. The `get_lexer_for_filename()` function only uses the primary
+list, whose entries are supposed to be unique among all lexers.
+`guess_lexer_for_filename()`, however, first collects all lexers whose primary
+or secondary filename patterns match the filename. If exactly one lexer
+matches, it is returned; otherwise the guessing mechanism of `guess_lexer()`
+is applied to the matching lexers, as the sketch below shows.
+
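+For example, with an ambiguous ``.html`` file (a sketch; which lexer wins
+depends on the actual content):
+
+.. sourcecode:: pycon
+
+    >>> from pygments.lexers import guess_lexer_for_filename
+    >>> guess_lexer_for_filename('index.html', '<html><body></body></html>')
+    <pygments.lexers.HtmlLexer>
+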
+As usual, keyword arguments to these functions are given to the created lexer
+as options.
+
+
Command line usage
==================
diff --git a/docs/src/tokens.txt b/docs/src/tokens.txt
index 47d8feea..daaf1eca 100644
--- a/docs/src/tokens.txt
+++ b/docs/src/tokens.txt
@@ -56,6 +56,9 @@ Normally you just create token types using the already defined aliases. For each
of those token aliases, a number of subtypes exist (excluding the special
tokens `Token.Text`, `Token.Error` and `Token.Other`).
+The `is_token_subtype()` function in the `pygments.token` module can be used to
+test if a token type is a subtype of another (such as `Name.Tag` and `Name`).
+
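+For example, `Name.Tag` is a subtype of `Name`, but not the other way
+around:
+
+.. sourcecode:: pycon
+
+    >>> from pygments.token import Name, is_token_subtype
+    >>> is_token_subtype(Name.Tag, Name)
+    True
+    >>> is_token_subtype(Name, Name.Tag)
+    False
+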
Keyword Tokens
==============
diff --git a/pygments/lexer.py b/pygments/lexer.py
index e5264888..8f9c7d7b 100644
--- a/pygments/lexer.py
+++ b/pygments/lexer.py
@@ -79,6 +79,13 @@ class Lexer(object):
self.stripall = get_bool_opt(options, 'stripall', False)
self.tabsize = get_int_opt(options, 'tabsize', 0)
+ def __repr__(self):
+ if self.options:
+ return '<pygments.lexers.%s with %r>' % (self.__class__.__name__,
+ self.options)
+ else:
+ return '<pygments.lexers.%s>' % self.__class__.__name__
+
def analyse_text(text):
"""
Has to return a float between ``0`` and ``1`` that indicates
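
To show how the pieces fit together, here is a hedged sketch of a lexer
implementing this hook; the class name, patterns and heuristic are invented
for illustration. Note that `analyse_text` takes no `self`: it is written as
a plain function in the class body, just like in the base class above.

.. sourcecode:: python

    from pygments.lexer import RegexLexer
    from pygments.token import Text

    class MyConfigLexer(RegexLexer):
        """Hypothetical lexer; everything here is illustrative."""
        name = 'MyConfig'
        aliases = ['myconfig']
        filenames = ['*.mycfg']        # primary patterns, unique per lexer
        alias_filenames = ['*.cfg']    # secondary patterns, may overlap
        mimetypes = ['text/x-myconfig']

        tokens = {
            'root': [(r'.*\n', Text)],
        }

        def analyse_text(text):
            # Called as a plain function during guessing; return a float
            # between 0.0 (not this lexer) and 1.0 (select immediately).
            return 1.0 if text.startswith('[myconfig]') else 0.0

With options set, the new `__repr__` would render an instance of this class
as e.g. ``<pygments.lexers.MyConfigLexer with {'tabsize': 8}>``.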
diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py
index f8e6c90a..529e3cb3 100644
--- a/pygments/lexers/__init__.py
+++ b/pygments/lexers/__init__.py
@@ -37,18 +37,6 @@ def _load_lexers(module_name):
_lexer_cache[cls.name] = cls
-def _iter_lexers():
- """
- Returns an iterator over all lexer classes.
- """
- for module_name, name, _, _ in LEXERS.itervalues():
- if name not in _lexer_cache:
- _load_lexers(module_name)
- yield _lexer_cache[name]
- for lexer in find_plugin_lexers():
- yield lexer
-
-
def get_lexer_by_name(_alias, **options):
"""
Get a lexer by an alias.
@@ -99,6 +87,18 @@ def get_lexer_for_mimetype(_mime, **options):
raise ValueError('no lexer for mimetype %r found' % _mime)
+def _iter_lexerclasses():
+ """
+ Returns an iterator over all lexer classes.
+ """
+ for module_name, name, _, _ in LEXERS.itervalues():
+ if name not in _lexer_cache:
+ _load_lexers(module_name)
+ yield _lexer_cache[name]
+ for lexer in find_plugin_lexers():
+ yield lexer
+
+
def guess_lexer_for_filename(_fn, _text, **options):
"""
    Look up all lexers that handle the filename primarily (``filenames``)
@@ -118,7 +118,7 @@ def guess_lexer_for_filename(_fn, _text, **options):
fn = basename(_fn)
primary = None
matching_lexers = set()
- for lexer in _iter_lexers():
+ for lexer in _iter_lexerclasses():
for filename in lexer.filenames:
if fnmatch.fnmatch(fn, filename):
matching_lexers.add(lexer)
@@ -129,7 +129,7 @@ def guess_lexer_for_filename(_fn, _text, **options):
if not matching_lexers:
raise ValueError('no lexer for filename %r found' % fn)
if len(matching_lexers) == 1:
- return iter(matching_lexers).next()
+ return matching_lexers.pop()(**options)
result = []
for lexer in matching_lexers:
rv = lexer.analyse_text(_text)
@@ -149,7 +149,7 @@ def guess_lexer(_text, **options):
#XXX: i (mitsuhiko) would like to drop this function in favor of the
# better guess_lexer_for_filename function.
best_lexer = [0.0, None]
- for lexer in _iter_lexers():
+ for lexer in _iter_lexerclasses():
        rv = lexer.analyse_text(_text)
if rv == 1.0:
return lexer(**options)
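
Note the fix in the single-match branch above: the bare lexer class used to
be returned, while ``matching_lexers.pop()(**options)`` instantiates it, so
keyword options actually reach the lexer. A small sketch (assuming ``*.py``
matches only the Python lexer's primary patterns):

.. sourcecode:: pycon

    >>> from pygments.lexers import guess_lexer_for_filename
    >>> lexer = guess_lexer_for_filename('example.py', 'print "x"', tabsize=8)
    >>> lexer.tabsize
    8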
diff --git a/pygments/token.py b/pygments/token.py
index 7ec93262..bbbcc648 100644
--- a/pygments/token.py
+++ b/pygments/token.py
@@ -57,6 +57,15 @@ Comment = Token.Comment
Generic = Token.Generic
+def is_token_subtype(ttype, other):
+ """Return True if ``ttype`` is a subtype of ``other``."""
+ while ttype is not None:
+ if ttype == other:
+ return True
+ ttype = ttype.parent
+ return False
+
+
# Map standard token types to short names, used in CSS class naming.
# If you add a new item, please be sure to run this file to perform
# a consistency check for duplicate values.