Raise a SyntaxError in detect_encoding() when a codec lookup fails, as the builtin parser does (#4021)

author: Benjamin Peterson <benjamin@python.org> 2008-12-12 01:25:05 +0000
committer: Benjamin Peterson <benjamin@python.org> 2008-12-12 01:25:05 +0000
commit: 6faa7b7c23929f85671557900513d49522c4af4d (patch)
tree: 1fb6999059dffee7e02588e2bf4911686f299f15 /Lib/tokenize.py
parent: a941dfba8ac5bf24e186d5e7414dc786e87fa1c5 (diff)
download: cpython-6faa7b7c23929f85671557900513d49522c4af4d.tar.gz
1 file changed, 20 insertions, 13 deletions
diff --git a/Lib/tokenize.py b/Lib/tokenize.py
index ec5a79a645..16c4f3f029 100644
--- a/Lib/tokenize.py
+++ b/Lib/tokenize.py
@@ -26,7 +26,7 @@ __credits__ = ('GvR, ESR, Tim Peters, Thomas Wouters, Fred Drake, '
 
 import re, string, sys
 from token import *
-from codecs import lookup
+from codecs import lookup, BOM_UTF8
 from itertools import chain, repeat
 cookie_re = re.compile("coding[:=]\s*([-\w.]+)")
 
@@ -251,11 +251,11 @@ def detect_encoding(readline):
 
     It detects the encoding from the presence of a utf-8 bom or an encoding
     cookie as specified in pep-0263. If both a bom and a cookie are present,
-    but disagree, a SyntaxError will be raised.
+    but disagree, a SyntaxError will be raised. If the encoding cookie is an
+    invalid charset, raise a SyntaxError.
 
     If no encoding is specified, then the default of 'utf-8' will be returned.
     """
-    utf8_bom = b'\xef\xbb\xbf'
     bom_found = False
     encoding = None
     def read_or_stop():
@@ -268,18 +268,25 @@ def detect_encoding(readline):
         try:
             line_string = line.decode('ascii')
         except UnicodeDecodeError:
-            pass
-        else:
-            matches = cookie_re.findall(line_string)
-            if matches:
-                encoding = matches[0]
-                if bom_found and lookup(encoding).name != 'utf-8':
-                    # This behaviour mimics the Python interpreter
-                    raise SyntaxError('encoding problem: utf-8')
-                return encoding
+            return None
+
+        matches = cookie_re.findall(line_string)
+        if not matches:
+            return None
+        encoding = matches[0]
+        try:
+            codec = lookup(encoding)
+        except LookupError:
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError("unknown encoding: " + encoding)
+
+        if bom_found and codec.name != 'utf-8':
+            # This behaviour mimics the Python interpreter
+            raise SyntaxError('encoding problem: utf-8')
+        return encoding
 
     first = read_or_stop()
-    if first.startswith(utf8_bom):
+    if first.startswith(BOM_UTF8):
         bom_found = True
         first = first[3:]
     if not first:
author	Benjamin Peterson <benjamin@python.org>	2008-12-12 01:25:05 +0000
committer	Benjamin Peterson <benjamin@python.org>	2008-12-12 01:25:05 +0000
commit	6faa7b7c23929f85671557900513d49522c4af4d (patch)
tree	1fb6999059dffee7e02588e2bf4911686f299f15 /Lib/tokenize.py
parent	a941dfba8ac5bf24e186d5e7414dc786e87fa1c5 (diff)
download	cpython-6faa7b7c23929f85671557900513d49522c4af4d.tar.gz