diff options
-rw-r--r-- docs/src/cmdline.txt | 20
-rw-r--r-- docs/src/unicode.txt | 10
-rw-r--r-- pygments/cmdline.py | 12
-rw-r--r-- pygments/formatters/terminal256.py | 7
-rw-r--r-- pygments/lexer.py | 6
5 files changed, 43 insertions, 12 deletions
diff --git a/docs/src/cmdline.txt b/docs/src/cmdline.txt index 494b0c5e..7e5143fa 100644 --- a/docs/src/cmdline.txt +++ b/docs/src/cmdline.txt @@ -81,4 +81,24 @@ will print the help for the HTML formatter, while :: will print the help for the Python lexer, etc. +A note on encodings +------------------- + +Pygments tries to be smart regarding encodings in the formatting process: + +* If you give an ``encoding`` option, it will be used as the input and + output encoding. + +* If you give an ``outencoding`` option, it will override ``encoding`` + as the output encoding. + +* If you don't give an encoding and have given an output file, the default + encoding for lexer and formatter is ``latin1`` (which will pass through + all non-ASCII characters). + +* If you don't give an encoding and haven't given an output file (that means + output is written to the console), the default encoding for lexer and + formatter is the terminal encoding (`sys.stdout.encoding`). + + .. _a particular formatter: formatters.txt diff --git a/docs/src/unicode.txt b/docs/src/unicode.txt index 8184e2b2..dc6394a9 100644 --- a/docs/src/unicode.txt +++ b/docs/src/unicode.txt @@ -34,10 +34,12 @@ source and the output stream does not accept Unicode written to it!** This is the case for all regular files and for terminals. Note: The Terminal formatter tries to be smart: if its output stream has an -`encoding` attribute, and you haven't set the option, -it will encode any Unicode string with this encoding -before writing it. This is the case for `sys.stdout`, for example. The other -formatters don't have that behavior. +`encoding` attribute, and you haven't set the option, it will encode any +Unicode string with this encoding before writing it. This is the case for +`sys.stdout`, for example. The other formatters don't have that behavior. + +Another note: If you call Pygments via the command line (`pygmentize`), +encoding is handled differently, see `the command line docs <cmdline.txt>`_. 
*New in Pygments 0.7*: the formatters now also accept an `outencoding` option which will override the `encoding` option if given. This makes it possible to diff --git a/pygments/cmdline.py b/pygments/cmdline.py index 0801eab0..5d0ba922 100644 --- a/pygments/cmdline.py +++ b/pygments/cmdline.py @@ -333,6 +333,18 @@ def main(args): return 2 code = sys.stdin.read() + # No encoding given? Use latin1 if output file given, + # stdin/stdout encoding otherwise. + # (This is a compromise, I'm not too happy with it...) + if 'encoding' not in O_opts and 'outencoding' not in O_opts: + if outfn: + # encoding pass-through + fmter.encoding = 'latin1' + else: + # use terminal encoding + lexer.encoding = sys.stdin.encoding + fmter.encoding = sys.stdout.encoding + # ... and do it! try: # process filters diff --git a/pygments/formatters/terminal256.py b/pygments/formatters/terminal256.py index 9c4ef617..cf313d83 100644 --- a/pygments/formatters/terminal256.py +++ b/pygments/formatters/terminal256.py @@ -187,13 +187,10 @@ class Terminal256Formatter(Formatter): if not enc and hasattr(outfile, "encoding") and \ hasattr(outfile, "isatty") and outfile.isatty(): enc = outfile.encoding - if enc: - encode = lambda value: value.encode(enc) - else: - encode = lambda value: value for ttype, value in tokensource: - value = encode(value) + if enc: + value = value.encode(enc) not_found = True while ttype and not_found: diff --git a/pygments/lexer.py b/pygments/lexer.py index c53dad0f..be1e07db 100644 --- a/pygments/lexer.py +++ b/pygments/lexer.py @@ -56,9 +56,9 @@ class Lexer(object): ``encoding`` If given, must be an encoding name. This encoding will be used to convert the input string to Unicode, if it is not already a Unicode - string. The default is to use latin1 (default: 'latin1'). - Can also be 'guess' to use a simple UTF-8 / Latin1 detection, or - 'chardet' to use the chardet library, if it is installed. + string (default: ``'latin1'``). 
+ Can also be ``'guess'`` to use a simple UTF-8 / Latin1 detection, or + ``'chardet'`` to use the chardet library, if it is installed. """ #: Name of the lexer