diff options
author | gbrandl <devnull@localhost> | 2006-10-28 22:09:41 +0200 |
---|---|---|
committer | gbrandl <devnull@localhost> | 2006-10-28 22:09:41 +0200 |
commit | 7f8c2354a497187f33e7c7fc7e8bcdc5f5a8b7a5 (patch) | |
tree | aa49a45ff8216f4502eedba057d621aa0e22589b | |
parent | d336a1d4c0375b2b7cf266b6073932e3abb0968d (diff) | |
download | pygments-7f8c2354a497187f33e7c7fc7e8bcdc5f5a8b7a5.tar.gz |
[svn] Some fixes, add docs for new features.
-rw-r--r-- | TODO | 14 | ||||
-rw-r--r-- | docs/src/api.txt | 42 | ||||
-rw-r--r-- | docs/src/quickstart.txt | 53 | ||||
-rw-r--r-- | docs/src/tokens.txt | 3 | ||||
-rw-r--r-- | pygments/lexer.py | 7 | ||||
-rw-r--r-- | pygments/lexers/__init__.py | 30 | ||||
-rw-r--r-- | pygments/token.py | 9 |
7 files changed, 126 insertions, 32 deletions
@@ -4,22 +4,15 @@ Todo before 0.5 ---------- -- add mimetype attributes - improve guess_lexer heuristics (esp. for template langs) - more unit tests -- documentation for new features (guessing) - - goto label HL support for languages that use it -- tell the PHP and DelphiLexer how to differ between Operators and - text. - for 0.6 ------- -- allow multiple token types per regex (done, but awkwardly) - allow "overlay" token types (e.g. Diff + X) - highlight specials: nth line, a word etc. - dhtml: overlays toggleable by javascript @@ -43,6 +36,9 @@ for 0.6 * tcl * (la)tex +- tell the PHP and DelphiLexer how to differ between Operators and + text. + - add a `Punctuation` token type for symbols that are not text but also not a symbol (blocks in ruby etc) @@ -54,8 +50,8 @@ for 0.6 - docstrings? -for 0.7 -------- +for 0.7 / later +--------------- - moin parser diff --git a/docs/src/api.txt b/docs/src/api.txt index 90317147..1d32c59f 100644 --- a/docs/src/api.txt +++ b/docs/src/api.txt @@ -43,6 +43,25 @@ def `get_lexer_for_filename(fn, **options):` Will raise `ValueError` if no lexer for that filename is found. +def `get_lexer_for_mimetype(mime, **options):` + Return a `Lexer` subclass instance that has `mime` in its mimetype + list. The lexer is given the `options` at its instantiation. + + Will raise `ValueError` if no lexer for that mimetype is found. + +def `guess_lexer(text, **options):` + Return a `Lexer` subclass instance that's guessed from the text + in `text`. For that, the `analyse_text()` method of every known + lexer class is called with the text as argument, and the lexer + which returned the highest value will be instantiated and returned. + + `ValueError` is raised if no lexer thinks it can handle the content. + +def `guess_lexer_for_filename(filename, text, **options):` + As `guess_lexer()`, but only lexers which have a pattern in `filenames` + or `alias_filenames` that matches `filename` are taken into consideration. 
+ + `ValueError` is raised if no lexer thinks it can handle the content. Functions from `pygments.formatters`: @@ -101,6 +120,12 @@ def `get_tokens_unprocessed(self, text):` This method must be overridden by subclasses. +def `analyse_text(text):` + A static method which is called for lexer guessing. It should analyze + the text and return a float in the range from ``0.0`` to ``1.0``. + If it returns ``0.0``, the lexer will not be selected as the most + probable one, if it returns ``1.0``, it will be selected immediately. + For a list of known tokens have a look at the `Tokens`_ page. The lexer also recognizes the following attributes that are used by the @@ -114,8 +139,21 @@ builtin lookup mechanism. the lexer from a list. `filenames` - A list of `fnmatch` patterns that can be used to find a lexer for - a given filename. + A list of `fnmatch` patterns that match filenames which contain + content for this lexer. The patterns in this list should be unique among + all lexers. + +`alias_filenames` + A list of `fnmatch` patterns that match filenames which may or may not + contain content for this lexer. This list is used by the + `guess_lexer_for_filename()` function, to determine which lexers are + then included in guessing the correct one. That means that e.g. every + lexer for HTML and a template language should include ``\*.html`` in + this list. + +`mimetypes` + A list of MIME types for content that can be lexed with this + lexer. .. _Tokens: tokens.txt diff --git a/docs/src/quickstart.txt b/docs/src/quickstart.txt index 5b8cdfaf..0d9a62bc 100644 --- a/docs/src/quickstart.txt +++ b/docs/src/quickstart.txt @@ -87,17 +87,58 @@ one of the following methods: .. sourcecode:: pycon - >>> from pygments.lexers import get_lexer_by_name, get_lexer_for_filename + >>> from pygments.lexers import (get_lexer_by_name, + ... 
get_lexer_for_filename, get_lexer_for_mimetype) + >>> get_lexer_by_name('python') - <pygments.lexers.agile.PythonLexer object at 0xb7bd6d0c> - >>> get_lexer_for_filename('spam.py') - <pygments.lexers.agile.PythonLexer object at 0xb7bd6b2c> + <pygments.lexers.PythonLexer> + + >>> get_lexer_for_filename('spam.rb') + <pygments.lexers.RubyLexer> + + >>> get_lexer_for_mimetype('text/x-perl') + <pygments.lexers.PerlLexer> + +All these functions accept keyword arguments; they will be passed to the lexer +as options. -The same API is available for formatters: use `get_formatter_by_name` and -`get_formatter_for_filename` from the `pygments.formatters` module +A similar API is available for formatters: use `get_formatter_by_name()` and +`get_formatter_for_filename()` from the `pygments.formatters` module for this purpose. +Guessing lexers +=============== + +If you don't know the content of the file, or you want to highlight a file +whose extension is ambiguous, such as ``.html`` (which could contain plain HTML +or some template tags), use these functions: + +.. sourcecode:: pycon + + >>> from pygments.lexers import guess_lexer, guess_lexer_for_filename + + >>> guess_lexer('#!/usr/bin/python\nprint "Hello World!"') + <pygments.lexers.PythonLexer> + + >>> guess_lexer_for_filename('test.py', 'print "Hello World!"') + <pygments.lexers.PythonLexer> + +`guess_lexer()` passes the given content to the lexer classes' `analyse_text()` +method and returns the one for which it returns the highest number. + +All lexers have two different filename pattern lists: the primary and the +secondary one. The `get_lexer_for_filename()` function only uses the primary +list, whose entries are supposed to be unique among all lexers. +`guess_lexer_for_filename()`, however, will first loop through all lexers and +look at the primary and secondary filename patterns if the filename matches. 
+If only one lexer matches, it is returned, else the guessing mechanism of +`guess_lexer()` is used with the matching lexers. + +As usual, keyword arguments to these functions are given to the created lexer +as options. + + Command line usage ================== diff --git a/docs/src/tokens.txt b/docs/src/tokens.txt index 47d8feea..daaf1eca 100644 --- a/docs/src/tokens.txt +++ b/docs/src/tokens.txt @@ -56,6 +56,9 @@ Normally you just create token types using the already defined aliases. For each of those token aliases, a number of subtypes exists (excluding the special tokens `Token.Text`, `Token.Error` and `Token.Other`) +The `is_token_subtype()` function in the `pygments.token` module can be used to +test if a token type is a subtype of another (such as `Name.Tag` and `Name`). + Keyword Tokens ============== diff --git a/pygments/lexer.py b/pygments/lexer.py index e5264888..8f9c7d7b 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -79,6 +79,13 @@ class Lexer(object): self.stripall = get_bool_opt(options, 'stripall', False) self.tabsize = get_int_opt(options, 'tabsize', 0) + def __repr__(self): + if self.options: + return '<pygments.lexers.%s with %r>' % (self.__class__.__name__, + self.options) + else: + return '<pygments.lexers.%s>' % self.__class__.__name__ + def analyse_text(text): """ Has to return a float between ``0`` and ``1`` that indicates diff --git a/pygments/lexers/__init__.py b/pygments/lexers/__init__.py index f8e6c90a..529e3cb3 100644 --- a/pygments/lexers/__init__.py +++ b/pygments/lexers/__init__.py @@ -37,18 +37,6 @@ def _load_lexers(module_name): _lexer_cache[cls.name] = cls -def _iter_lexers(): - """ - Returns an iterator over all lexer classes. - """ - for module_name, name, _, _ in LEXERS.itervalues(): - if name not in _lexer_cache: - _load_lexers(module_name) - yield _lexer_cache[name] - for lexer in find_plugin_lexers(): - yield lexer - - def get_lexer_by_name(_alias, **options): """ Get a lexer by an alias. 
@@ -99,6 +87,18 @@ def get_lexer_for_mimetype(_mime, **options): raise ValueError('no lexer for mimetype %r found' % _mime) +def _iter_lexerclasses(): + """ + Returns an iterator over all lexer classes. + """ + for module_name, name, _, _ in LEXERS.itervalues(): + if name not in _lexer_cache: + _load_lexers(module_name) + yield _lexer_cache[name] + for lexer in find_plugin_lexers(): + yield lexer + + def guess_lexer_for_filename(_fn, _text, **options): """ Lookup all lexers that handle those filenames primary (``filenames``) @@ -118,7 +118,7 @@ def guess_lexer_for_filename(_fn, _text, **options): fn = basename(_fn) primary = None matching_lexers = set() - for lexer in _iter_lexers(): + for lexer in _iter_lexerclasses(): for filename in lexer.filenames: if fnmatch.fnmatch(fn, filename): matching_lexers.add(lexer) @@ -129,7 +129,7 @@ def guess_lexer_for_filename(_fn, _text, **options): if not matching_lexers: raise ValueError('no lexer for filename %r found' % fn) if len(matching_lexers) == 1: - return iter(matching_lexers).next() + return matching_lexers.pop()(**options) result = [] for lexer in matching_lexers: rv = lexer.analyse_text(_text) @@ -149,7 +149,7 @@ def guess_lexer(_text, **options): #XXX: i (mitsuhiko) would like to drop this function in favor of the # better guess_lexer_for_filename function. best_lexer = [0.0, None] - for lexer in _iter_lexers(): + for lexer in _iter_lexerclasses(): rv = lexer.analyse_text(text) if rv == 1.0: return lexer(**options) diff --git a/pygments/token.py b/pygments/token.py index 7ec93262..bbbcc648 100644 --- a/pygments/token.py +++ b/pygments/token.py @@ -57,6 +57,15 @@ Comment = Token.Comment Generic = Token.Generic +def is_token_subtype(ttype, other): + """Return True if ``ttype`` is a subtype of ``other``.""" + while ttype is not None: + if ttype == other: + return True + ttype = ttype.parent + return False + + # Map standard token types to short names, used in CSS class naming. 
# If you add a new item, please be sure to run this file to perform # a consistency check for duplicate values. |