diff options
-rw-r--r-- | coverage/execfile.py | 3 | ||||
-rw-r--r-- | coverage/parser.py | 16 | ||||
-rw-r--r-- | coverage/phystokens.py | 42 | ||||
-rw-r--r-- | coverage/python.py | 16 | ||||
-rw-r--r-- | tests/test_parser.py | 5 | ||||
-rw-r--r-- | tests/test_phystokens.py | 41 |
6 files changed, 93 insertions, 30 deletions
diff --git a/coverage/execfile.py b/coverage/execfile.py index 2d856897..942bfd57 100644 --- a/coverage/execfile.py +++ b/coverage/execfile.py @@ -8,6 +8,7 @@ import types from coverage.backward import BUILTINS from coverage.backward import PYC_MAGIC_NUMBER, imp, importlib_util_find_spec from coverage.misc import ExceptionDuringRun, NoCode, NoSource +from coverage.phystokens import compile_unicode from coverage.python import get_python_source @@ -182,7 +183,7 @@ def make_code_from_py(filename): except (IOError, NoSource): raise NoSource("No file to run: '%s'" % filename) - code = compile(source, filename, "exec") + code = compile_unicode(source, filename, "exec") return code diff --git a/coverage/parser.py b/coverage/parser.py index fc751eb2..173bdf9d 100644 --- a/coverage/parser.py +++ b/coverage/parser.py @@ -9,9 +9,9 @@ import tokenize from coverage.backward import range # pylint: disable=redefined-builtin from coverage.backward import bytes_to_ints from coverage.bytecode import ByteCodes, CodeObjects -from coverage.misc import nice_pair, expensive, join_regex +from coverage.misc import contract, nice_pair, expensive, join_regex from coverage.misc import CoverageException, NoSource, NotPython -from coverage.phystokens import generate_tokens +from coverage.phystokens import compile_unicode, generate_tokens class CodeParser(object): @@ -34,6 +34,7 @@ class CodeParser(object): class PythonParser(CodeParser): """Parse code to find executable lines, excluded lines, etc.""" + @contract(text='unicode|None') def __init__(self, text=None, filename=None, exclude=None): """ Source can be provided as `text`, the text itself, or `filename`, from @@ -53,14 +54,6 @@ class PythonParser(CodeParser): "No source for code: '%s': %s" % (self.filename, err) ) - if self.text: - assert isinstance(self.text, str) - # Scrap the BOM if it exists. - # (Used to do this, but no longer. Not sure what bad will happen - # if we don't do it.) - # if ord(self.text[0]) == 0xfeff: - # self.text = self.text[1:] - self.exclude = exclude self.show_tokens = False @@ -342,13 +335,14 @@ OP_RETURN_VALUE = _opcode('RETURN_VALUE') class ByteParser(object): """Parse byte codes to understand the structure of code.""" + @contract(text='unicode') def __init__(self, text, code=None, filename=None): self.text = text if code: self.code = code else: try: - self.code = compile(text, filename, "exec") + self.code = compile_unicode(text, filename, "exec") except SyntaxError as synerr: raise NotPython( "Couldn't parse '%s' as Python source: '%s' at line %d" % ( diff --git a/coverage/phystokens.py b/coverage/phystokens.py index ed6bd238..d21d401c 100644 --- a/coverage/phystokens.py +++ b/coverage/phystokens.py @@ -8,6 +8,7 @@ import tokenize from coverage import env from coverage.backward import iternext +from coverage.misc import contract def phys_tokens(toks): @@ -148,6 +149,8 @@ class CachedTokenizer(object): generate_tokens = CachedTokenizer().generate_tokens +COOKIE_RE = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)", flags=re.MULTILINE) + def _source_encoding_py2(source): """Determine the encoding for `source`, according to PEP 263. @@ -165,8 +168,6 @@ def _source_encoding_py2(source): # This is mostly code adapted from Py3.2's tokenize module. - cookie_re = re.compile(r"^\s*#.*coding[:=]\s*([-\w.]+)") - def _get_normal_name(orig_enc): """Imitates get_normal_name in tokenizer.c.""" # Only care about the first 12 characters. @@ -204,7 +205,7 @@ def _source_encoding_py2(source): except UnicodeDecodeError: return None - matches = cookie_re.findall(line_string) + matches = COOKIE_RE.findall(line_string) if not matches: return None encoding = _get_normal_name(matches[0]) @@ -265,3 +266,38 @@ if env.PY3: source_encoding = _source_encoding_py3 else: source_encoding = _source_encoding_py2 + + +@contract(source='unicode') +def compile_unicode(source, filename, mode): + """Just like the `compile` builtin, but works on any Unicode string. + + Python 2's compile() builtin has a stupid restriction: if the source string + is Unicode, then it may not have a encoding declaration in it. Why not? + Who knows! + + This function catches that exception, neuters the coding declaration, and + compiles it anyway. + + """ + try: + code = compile(source, filename, mode) + except SyntaxError as synerr: + if synerr.args[0] != "encoding declaration in Unicode string": + raise + source = neuter_encoding_declaration(source) + code = compile(source, filename, mode) + + return code + + +@contract(source='unicode', returns='unicode') +def neuter_encoding_declaration(source): + """Return `source`, with any encoding declaration neutered. + + This function will only ever be called on `source` that has an encoding + declaration, so some edge cases can be ignored. + + """ + source = COOKIE_RE.sub("# (deleted declaration)", source) + return source diff --git a/coverage/python.py b/coverage/python.py index 19212a5b..f335f165 100644 --- a/coverage/python.py +++ b/coverage/python.py @@ -8,12 +8,13 @@ import zipimport from coverage import env from coverage.backward import unicode_class from coverage.files import FileLocator -from coverage.misc import NoSource, join_regex +from coverage.misc import contract, NoSource, join_regex from coverage.parser import PythonParser from coverage.phystokens import source_token_lines, source_encoding from coverage.plugin import FileReporter +@contract(returns='str') def read_python_source(filename): """Read the Python source text from `filename`. @@ -30,8 +31,9 @@ def read_python_source(filename): return f.read() +@contract(returns='unicode') def get_python_source(filename): - """Return the source code, as a str.""" + """Return the source code, as unicode.""" base, ext = os.path.splitext(filename) if ext == ".py" and env.WINDOWS: exts = [".py", ".pyw"] @@ -49,12 +51,15 @@ def get_python_source(filename): source = get_zip_bytes(try_filename) if source is not None: if env.PY3: - source = source.decode(source_encoding(source)) + source = source.decode(source_encoding(source), "replace") break else: # Couldn't find source. raise NoSource("No source for code: '%s'." % filename) + if env.PY2: + source = source.decode(source_encoding(source), "replace") + # Python code should always end with a line with a newline. if source and source[-1] != '\n': source += '\n' @@ -62,6 +67,7 @@ def get_python_source(filename): return source +@contract(returns='bytes|None') def get_zip_bytes(filename): """Get data from `filename` if it is a zip file path. @@ -161,12 +167,10 @@ class PythonFileReporter(FileReporter): def exit_counts(self): return self.parser.exit_counts() + @contract(returns='unicode') def source(self): if self._source is None: self._source = get_python_source(self.filename) - if env.PY2: - encoding = source_encoding(self._source) - self._source = self._source.decode(encoding, "replace") assert isinstance(self._source, unicode_class) return self._source diff --git a/tests/test_parser.py b/tests/test_parser.py index 81916a98..9359c408 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1,7 +1,10 @@ """Tests for Coverage.py's code parsing.""" import textwrap + from tests.coveragetest import CoverageTest + +from coverage import env from coverage.parser import PythonParser @@ -12,6 +15,8 @@ class PythonParserTest(CoverageTest): def parse_source(self, text): """Parse `text` as source, and return the `PythonParser` used.""" + if env.PY2: + text = text.decode("ascii") text = textwrap.dedent(text) parser = PythonParser(text=text, exclude="nocover") parser.parse_source() diff --git a/tests/test_phystokens.py b/tests/test_phystokens.py index 7edd6aa4..19f813ae 100644 --- a/tests/test_phystokens.py +++ b/tests/test_phystokens.py @@ -5,6 +5,7 @@ import re from coverage import env from coverage.phystokens import source_token_lines, source_encoding +from coverage.phystokens import neuter_encoding_declaration from tests.coveragetest import CoverageTest @@ -92,21 +93,27 @@ else: DEF_ENCODING = "ascii" +ENCODING_DECLARATION_SOURCES = [ + # Various forms from http://www.python.org/dev/peps/pep-0263/ + b"# coding=cp850\n\n", + b"#!/usr/bin/python\n# -*- coding: cp850 -*-\n", + b"#!/usr/bin/python\n# vim: set fileencoding=cp850:\n", + b"# This Python file uses this encoding: cp850\n", + b"# This file uses a different encoding:\n# coding: cp850\n", +] + class SourceEncodingTest(CoverageTest): """Tests of source_encoding() for detecting encodings.""" run_in_temp_dir = False def test_detect_source_encoding(self): - # Various forms from http://www.python.org/dev/peps/pep-0263/ - source = b"# coding=cp850\n\n" - self.assertEqual(source_encoding(source), 'cp850') - source = b"#!/usr/bin/python\n# -*- coding: utf-8 -*-\n" - self.assertEqual(source_encoding(source), 'utf-8') - source = b"#!/usr/bin/python\n# vim: set fileencoding=utf8 :\n" - self.assertEqual(source_encoding(source), 'utf8') - source = b"# This Python file uses this encoding: utf-8\n" - self.assertEqual(source_encoding(source), 'utf-8') + for source in ENCODING_DECLARATION_SOURCES: + self.assertEqual( + source_encoding(source), + 'cp850', + "Wrong encoding in %r" % source + ) def test_detect_source_encoding_not_in_comment(self): if env.PYPY and env.PY3: @@ -140,3 +147,19 @@ class SourceEncodingTest(CoverageTest): source = b"\xEF\xBB\xBF# coding: cp850\n" with self.assertRaises(SyntaxError): source_encoding(source) + + +class NeuterEncodingDeclarationTest(CoverageTest): + """Tests of phystokens.neuter_encoding_declaration().""" + + run_in_temp_dir = False + + def test_neuter_encoding_declaration(self): + for source in ENCODING_DECLARATION_SOURCES: + neutered = neuter_encoding_declaration(source.decode("ascii")) + neutered = neutered.encode("ascii") + self.assertEqual( + source_encoding(neutered), + DEF_ENCODING, + "Wrong encoding in %r" % neutered + ) |