summaryrefslogtreecommitdiff
path: root/Lib/tokenize.py
diff options
context:
space:
mode:
author Benjamin Peterson <benjamin@python.org> 2008-12-12 01:25:05 +0000
committer Benjamin Peterson <benjamin@python.org> 2008-12-12 01:25:05 +0000
commit 6faa7b7c23929f85671557900513d49522c4af4d (patch)
tree 1fb6999059dffee7e02588e2bf4911686f299f15 /Lib/tokenize.py
parent a941dfba8ac5bf24e186d5e7414dc786e87fa1c5 (diff)
download cpython-6faa7b7c23929f85671557900513d49522c4af4d.tar.gz
Raise a SyntaxError in detect_encoding() when a codec lookup fails, as the builtin parser does (issue #4021)
Diffstat (limited to 'Lib/tokenize.py')
-rw-r--r-- Lib/tokenize.py 33
1 files changed, 20 insertions, 13 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec5a79a645..16c4f3f029 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
import re, string, sys
from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
from itertools import chain, repeat
cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
@@ -251,11 +251,11 @@ def detect_encoding(readline):
It detects the encoding from the presence of a utf-8 bom or an encoding
cookie as specified in pep-0263. If both a bom and a cookie are present,
- but disagree, a SyntaxError will be raised.
+ but disagree, a SyntaxError will be raised. If the encoding cookie is an
+ invalid charset, raise a SyntaxError.
If no encoding is specified, then the default of 'utf-8' will be returned.
"""
- utf8_bom = b'\xef\xbb\xbf'
bom_found = False
encoding = None
def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
try:
line_string = line.decode('ascii')
except UnicodeDecodeError:
- pass
- else:
- matches = cookie_re.findall(line_string)
- if matches:
- encoding = matches[0]
- if bom_found and lookup(encoding).name != 'utf-8':
- # This behaviour mimics the Python interpreter
- raise SyntaxError('encoding problem: utf-8')
- return encoding
+ return None
+
+ matches = cookie_re.findall(line_string)
+ if not matches:
+ return None
+ encoding = matches[0]
+ try:
+ codec = lookup(encoding)
+ except LookupError:
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError("unknown encoding: " + encoding)
+
+ if bom_found and codec.name != 'utf-8':
+ # This behaviour mimics the Python interpreter
+ raise SyntaxError('encoding problem: utf-8')
+ return encoding
first = read_or_stop()
- if first.startswith(utf8_bom):
+ if first.startswith(BOM_UTF8):
bom_found = True
first = first[3:]
if not first: