6 files changed, 93 insertions, 30 deletions
diff --git a/coverage/execfile.py b/coverage/execfile.py
index 2d856897..942bfd57 100644
--- a/coverage/execfile.py
+++ b/coverage/execfile.py
@@ -8,6 +8,7 @@ import types
 from coverage.backward import BUILTINS
 from coverage.backward import PYC_MAGIC_NUMBER, imp, importlib_util_find_spec
 from coverage.misc import ExceptionDuringRun, NoCode, NoSource
+from coverage.phystokens import compile_unicode
 from coverage.python import get_python_source
 
 
@@ -182,7 +183,7 @@ def make_code_from_py(filename):
     except (IOError, NoSource):
         raise NoSource("No file to run: '%s'" % filename)
 
-    code = compile(source, filename, "exec")
+    code = compile_unicode(source, filename, "exec")
     return code
 
 
diff --git a/coverage/parser.py b/coverage/parser.py
index fc751eb2..173bdf9d 100644
--- a/coverage/parser.py
+++ b/coverage/parser.py
@@ -9,9 +9,9 @@ import tokenize
 from coverage.backward import range    # pylint: disable=redefined-builtin
 from coverage.backward import bytes_to_ints
 from coverage.bytecode import ByteCodes, CodeObjects
-from coverage.misc import nice_pair, expensive, join_regex
+from coverage.misc import contract, nice_pair, expensive, join_regex
 from coverage.misc import CoverageException, NoSource, NotPython
-from coverage.phystokens import generate_tokens
+from coverage.phystokens import compile_unicode, generate_tokens
 
 
 class CodeParser(object):
@@ -34,6 +34,7 @@ class CodeParser(object):
 class PythonParser(CodeParser):
     """Parse code to find executable lines, excluded lines, etc."""
 
+    @contract(text='unicode|None')
     def __init__(self, text=None, filename=None, exclude=None):
         """
         Source can be provided as `text`, the text itself, or `filename`, from
@@ -53,14 +54,6 @@ class PythonParser(CodeParser):
                     "No source for code: '%s': %s" % (self.filename, err)
                     )
 
-        if self.text:
-            assert isinstance(self.text, str)
-            # Scrap the BOM if it exists.
-            # (Used to do this, but no longer.  Not sure what bad will happen
-            # if we don't do it.)
-            #   if ord(self.text[0]) == 0xfeff:
-            #       self.text = self.text[1:]
-
         self.exclude = exclude
 
         self.show_tokens = False
@@ -342,13 +335,14 @@ OP_RETURN_VALUE = _opcode('RETURN_VALUE')
 class ByteParser(object):
     """Parse byte codes to understand the structure of code."""
 
+    @contract(text='unicode')
     def __init__(self, text, code=None, filename=None):
         self.text = text
         if code:
             self.code = code
         else:
             try:
-                self.code = compile(text, filename, "exec")
+                self.code = compile_unicode(text, filename, "exec")
             except SyntaxError as synerr:
                 raise NotPython(
                     "Couldn't parse '%s' as Python source: '%s' at line %d" % (
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index ed6bd238..d21d401c 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -8,6 +8,7 @@ import tokenize
 
 from coverage import env
 from coverage.backward import iternext
+from coverage.misc import contract
 
 
 def phys_tokens(toks):
@@ -148,6 +149,8 @@ class CachedTokenizer(object):
 generate_tokens = CachedTokenizer().generate_tokens
 
 
+COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE)
+
 def _source_encoding_py2(source):
     """Determine the encoding for `source`, according to PEP 263.
 
@@ -165,8 +168,6 @@ def _source_encoding_py2(source):
 
     # This is mostly code adapted from Py3.2's tokenize module.
 
-    cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)")
-
     def _get_normal_name(orig_enc):
         """Imitates get_normal_name in tokenizer.c."""
         # Only care about the first 12 characters.
@@ -204,7 +205,7 @@ def _source_encoding_py2(source):
         except UnicodeDecodeError:
             return None
 
-        matches = cookie_re.findall(line_string)
+        matches = COOKIE_RE.findall(line_string)
         if not matches:
             return None
         encoding = _get_normal_name(matches[0])
@@ -265,3 +266,38 @@ if env.PY3:
     source_encoding = _source_encoding_py3
 else:
     source_encoding = _source_encoding_py2
+
+
+@contract(source='unicode')
+def compile_unicode(source, filename, mode):
+    """Compile `source` like the `compile` builtin, for any Unicode string.
+
+    Python 2's compile() builtin rejects a Unicode source string that has
+    an encoding declaration in it, raising SyntaxError with the message
+    "encoding declaration in Unicode string".
+
+    When exactly that error is raised, the coding declaration is neutered
+    and the source is compiled a second time.  Every other SyntaxError is
+    re-raised unchanged.
+
+    """
+    try:
+        return compile(source, filename, mode)
+    except SyntaxError as synerr:
+        if synerr.args[0] != "encoding declaration in Unicode string":
+            raise
+
+    # Only the encoding-declaration error gets here: retry without it.
+    return compile(neuter_encoding_declaration(source), filename, mode)
+
+
+@contract(source='unicode', returns='unicode')
+def neuter_encoding_declaration(source):
+    """Return `source` with its PEP 263 encoding declaration neutered.
+
+    Only the first two lines are rewritten, one at a time: the line count
+    is preserved and later comments mentioning "coding:" are left alone.
+    """
+    lines = source.split("\n")
+    lines[:2] = [COOKIE_RE.sub("# (deleted declaration)", t) for t in lines[:2]]
+    return "\n".join(lines)
diff --git a/coverage/python.py b/coverage/python.py
index 19212a5b..f335f165 100644
--- a/coverage/python.py
+++ b/coverage/python.py
@@ -8,12 +8,13 @@ import zipimport
 from coverage import env
 from coverage.backward import unicode_class
 from coverage.files import FileLocator
-from coverage.misc import NoSource, join_regex
+from coverage.misc import contract, NoSource, join_regex
 from coverage.parser import PythonParser
 from coverage.phystokens import source_token_lines, source_encoding
 from coverage.plugin import FileReporter
 
 
+@contract(returns='str')
 def read_python_source(filename):
     """Read the Python source text from `filename`.
 
@@ -30,8 +31,9 @@ def read_python_source(filename):
         return f.read()
 
 
+@contract(returns='unicode')
 def get_python_source(filename):
-    """Return the source code, as a str."""
+    """Return the source code, as unicode."""
     base, ext = os.path.splitext(filename)
     if ext == ".py" and env.WINDOWS:
         exts = [".py", ".pyw"]
@@ -49,12 +51,15 @@ def get_python_source(filename):
         source = get_zip_bytes(try_filename)
         if source is not None:
             if env.PY3:
-                source = source.decode(source_encoding(source))
+                source = source.decode(source_encoding(source), "replace")
             break
     else:
         # Couldn't find source.
         raise NoSource("No source for code: '%s'." % filename)
 
+    if env.PY2:
+        source = source.decode(source_encoding(source), "replace")
+
     # Python code should always end with a line with a newline.
     if source and source[-1] != '\n':
         source += '\n'
@@ -62,6 +67,7 @@ def get_python_source(filename):
     return source
 
 
+@contract(returns='bytes|None')
 def get_zip_bytes(filename):
     """Get data from `filename` if it is a zip file path.
 
@@ -161,12 +167,10 @@ class PythonFileReporter(FileReporter):
     def exit_counts(self):
         return self.parser.exit_counts()
 
+    @contract(returns='unicode')
     def source(self):
         if self._source is None:
             self._source = get_python_source(self.filename)
-            if env.PY2:
-                encoding = source_encoding(self._source)
-                self._source = self._source.decode(encoding, "replace")
             assert isinstance(self._source, unicode_class)
         return self._source
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 81916a98..9359c408 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,7 +1,10 @@
 """Tests for Coverage.py's code parsing."""
 
 import textwrap
+
 from tests.coveragetest import CoverageTest
+
+from coverage import env
 from coverage.parser import PythonParser
 
 
@@ -12,6 +15,8 @@ class PythonParserTest(CoverageTest):
 
     def parse_source(self, text):
         """Parse `text` as source, and return the `PythonParser` used."""
+        if env.PY2:
+            text = text.decode("ascii")
         text = textwrap.dedent(text)
         parser = PythonParser(text=text, exclude="nocover")
         parser.parse_source()
diff --git a/tests/test_phystokens.py b/tests/test_phystokens.py
index 7edd6aa4..19f813ae 100644
--- a/tests/test_phystokens.py
+++ b/tests/test_phystokens.py
@@ -5,6 +5,7 @@ import re
 
 from coverage import env
 from coverage.phystokens import source_token_lines, source_encoding
+from coverage.phystokens import neuter_encoding_declaration
 
 from tests.coveragetest import CoverageTest
 
@@ -92,21 +93,27 @@ else:
     DEF_ENCODING = "ascii"
 
 
+ENCODING_DECLARATION_SOURCES = [
+    # Declaration forms from http://www.python.org/dev/peps/pep-0263/, each naming cp850.
+    b"# coding=cp850\n\n",
+    b"#!/usr/bin/python\n# -*- coding: cp850 -*-\n",
+    b"#!/usr/bin/python\n# vim: set fileencoding=cp850:\n",
+    b"# This Python file uses this encoding: cp850\n",
+    b"# This file uses a different encoding:\n# coding: cp850\n",
+]
+
 class SourceEncodingTest(CoverageTest):
     """Tests of source_encoding() for detecting encodings."""
 
     run_in_temp_dir = False
 
     def test_detect_source_encoding(self):
-        # Various forms from http://www.python.org/dev/peps/pep-0263/
-        source = b"# coding=cp850\n\n"
-        self.assertEqual(source_encoding(source), 'cp850')
-        source = b"#!/usr/bin/python\n# -*- coding: utf-8 -*-\n"
-        self.assertEqual(source_encoding(source), 'utf-8')
-        source = b"#!/usr/bin/python\n# vim: set fileencoding=utf8 :\n"
-        self.assertEqual(source_encoding(source), 'utf8')
-        source = b"# This Python file uses this encoding: utf-8\n"
-        self.assertEqual(source_encoding(source), 'utf-8')
+        for source in ENCODING_DECLARATION_SOURCES:
+            self.assertEqual(
+                source_encoding(source),
+                'cp850',
+                "Wrong encoding in %r" % source
+            )
 
     def test_detect_source_encoding_not_in_comment(self):
         if env.PYPY and env.PY3:
@@ -140,3 +147,19 @@ class SourceEncodingTest(CoverageTest):
         source = b"\xEF\xBB\xBF# coding: cp850\n"
         with self.assertRaises(SyntaxError):
             source_encoding(source)
+
+
+class NeuterEncodingDeclarationTest(CoverageTest):
+    """Check that neutered sources no longer declare an encoding."""
+
+    run_in_temp_dir = False
+
+    def test_neuter_encoding_declaration(self):
+        # Every sample declares cp850; once neutered, detection must fall
+        # back to the interpreter's default source encoding.
+        for declared in ENCODING_DECLARATION_SOURCES:
+            text = neuter_encoding_declaration(declared.decode("ascii"))
+            neutered = text.encode("ascii")
+            detected = source_encoding(neutered)
+            message = "Wrong encoding in %r" % neutered
+            self.assertEqual(detected, DEF_ENCODING, message)