diff options
author | Benjamin Peterson <benjamin@python.org> | 2008-12-12 01:25:05 +0000 |
---|---|---|
committer | Benjamin Peterson <benjamin@python.org> | 2008-12-12 01:25:05 +0000 |
commit | 6faa7b7c23929f85671557900513d49522c4af4d (patch) | |
tree | 1fb6999059dffee7e02588e2bf4911686f299f15 /Lib/tokenize.py | |
parent | a941dfba8ac5bf24e186d5e7414dc786e87fa1c5 (diff) | |
download | cpython-6faa7b7c23929f85671557900513d49522c4af4d.tar.gz |
Raise a SyntaxError in detect_encoding() when a codec lookup fails, as the builtin parser does (#4021).
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r-- | Lib/tokenize.py | 33 |
1 file changed, 20 insertions, 13 deletions
```diff
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec5a79a645..16c4f3f029 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
 
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
@@ -251,11 +251,11 @@ def detect_encoding(readline):
 
     It detects the encoding from the presence of a utf-8 bom or an encoding
     cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
-    utf8_bom = b'\xef\xbb\xbf'
     bom_found = False
     encoding = None
     def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
 
     first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
     if not first:
```