Properly handle filenames with non-ASCII characters. #432

author: Ned Batchelder <ned@nedbatchelder.com> 2015-10-25 17:40:09 -0400
committer: Ned Batchelder <ned@nedbatchelder.com> 2015-10-25 17:40:09 -0400
commit: b810cbc0d06df0a04e3380166215b8ad2f40524c (patch)
tree: b0ea1499815b84301fa7991e37de128265526e81
parent: 15100248c12b85a00278371fea60f07718a9d499 (diff)
download: python-coveragepy-git-b810cbc0d06df0a04e3380166215b8ad2f40524c.tar.gz
11 files changed, 193 insertions, 33 deletions
diff --git a/CHANGES.rst b/CHANGES.rst
index 035c3b72..a61ef2d4 100644
--- a/CHANGES.rst
+++ b/CHANGES.rst
@@ -11,7 +11,11 @@ Version 4.0.2
 
 - More work on supporting unusually encoded source. Fixed `issue 431`_.
 
+- Files or directories with non-ASCII characters in their names are now
+  handled properly, fixing `issue 432`_.
+
 .. _issue 431: https://bitbucket.org/ned/coveragepy/issues/431/couldnt-parse-python-file-with-cp1252
+.. _issue 432: https://bitbucket.org/ned/coveragepy/issues/432/path-with-unicode-characters-various
 
 
 Version 4.0.1 --- 13 October 2015
diff --git a/coverage/files.py b/coverage/files.py
index 0b5651cb..855d8157 100644
--- a/coverage/files.py
+++ b/coverage/files.py
@@ -13,12 +13,9 @@ import sys
 
 from coverage import env
 from coverage.backward import unicode_class
-from coverage.misc import CoverageException, join_regex, isolate_module
+from coverage.misc import contract, CoverageException, join_regex, isolate_module
 
 
-RELATIVE_DIR = None
-CANONICAL_FILENAME_CACHE = {}
-
 os = isolate_module(os)
 
 
@@ -33,10 +30,13 @@ def set_relative_directory():
     # avoid duplicating work.
     CANONICAL_FILENAME_CACHE = {}
 
+
 def relative_directory():
     """Return the directory that `relative_filename` is relative to."""
     return RELATIVE_DIR
 
+
+@contract(returns='unicode')
 def relative_filename(filename):
     """Return the relative form of `filename`.
 
@@ -47,8 +47,10 @@ def relative_filename(filename):
     fnorm = os.path.normcase(filename)
     if fnorm.startswith(RELATIVE_DIR):
         filename = filename[len(RELATIVE_DIR):]
-    return filename
+    return unicode_filename(filename)
 
+
+@contract(returns='unicode')
 def canonical_filename(filename):
     """Return a canonical file name for `filename`.
 
@@ -65,6 +67,8 @@ def canonical_filename(filename):
                     filename = f
                     break
         cf = abs_file(filename)
+        if env.PY2 and isinstance(cf, str):
+            cf = cf.decode(sys.getfilesystemencoding())
         CANONICAL_FILENAME_CACHE[filename] = cf
     return CANONICAL_FILENAME_CACHE[filename]
 
@@ -126,14 +130,35 @@ else:
         return filename
 
 
+if env.PY2:
+    @contract(returns='unicode')
+    def unicode_filename(filename):
+        """Return a Unicode version of `filename`."""
+        if isinstance(filename, str):
+            filename = filename.decode(sys.getfilesystemencoding())
+        return filename
+else:
+    @contract(filename='unicode', returns='unicode')
+    def unicode_filename(filename):
+        """Return a Unicode version of `filename`."""
+        return filename
+
+
+@contract(returns='unicode')
 def abs_file(filename):
     """Return the absolute normalized form of `filename`."""
     path = os.path.expandvars(os.path.expanduser(filename))
     path = os.path.abspath(os.path.realpath(path))
     path = actual_path(path)
+    path = unicode_filename(path)
     return path
 
 
+RELATIVE_DIR = None
+CANONICAL_FILENAME_CACHE = None
+set_relative_directory()
+
+
 def isabs_anywhere(filename):
     """Is `filename` an absolute path on any OS?"""
     return ntpath.isabs(filename) or posixpath.isabs(filename)
diff --git a/coverage/parser.py b/coverage/parser.py
index 882c972b..a5e96237 100644
--- a/coverage/parser.py
+++ b/coverage/parser.py
@@ -211,7 +211,7 @@ class PythonParser(object):
             else:
                 lineno = err.args[1][0]     # TokenError
             raise NotPython(
-                "Couldn't parse '%s' as Python source: '%s' at line %d" % (
+                u"Couldn't parse '%s' as Python source: '%s' at line %d" % (
                     self.filename, err.args[0], lineno
                 )
             )
@@ -338,7 +338,7 @@ class ByteParser(object):
                 self.code = compile_unicode(text, filename, "exec")
             except SyntaxError as synerr:
                 raise NotPython(
-                    "Couldn't parse '%s' as Python source: '%s' at line %d" % (
+                    u"Couldn't parse '%s' as Python source: '%s' at line %d" % (
                         filename, synerr.msg, synerr.lineno
                     )
                 )
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index 203d41f2..f5bd0bc9 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -6,6 +6,7 @@
 import codecs
 import keyword
 import re
+import sys
 import token
 import tokenize
 
@@ -281,6 +282,8 @@ def compile_unicode(source, filename, mode):
 
     """
     source = neuter_encoding_declaration(source)
+    if env.PY2 and isinstance(filename, unicode):
+        filename = filename.encode(sys.getfilesystemencoding(), "replace")
     code = compile(source, filename, mode)
     return code
 
diff --git a/coverage/python.py b/coverage/python.py
index 71b50f0c..fe32150a 100644
--- a/coverage/python.py
+++ b/coverage/python.py
@@ -4,6 +4,7 @@
 """Python source expertise for coverage.py"""
 
 import os.path
+import sys
 import zipimport
 
 from coverage import env, files
@@ -95,6 +96,9 @@ class PythonFileReporter(FileReporter):
         else:
             filename = morf
 
+        if env.PY2 and isinstance(filename, str):
+            filename = filename.decode(sys.getfilesystemencoding())
+
         # .pyc files should always refer to a .py instead.
         if filename.endswith(('.pyc', '.pyo')):
             filename = filename[:-1]
@@ -106,6 +110,8 @@ class PythonFileReporter(FileReporter):
         if hasattr(morf, '__name__'):
             name = morf.__name__
             name = name.replace(".", os.sep) + ".py"
+            if isinstance(name, bytes):
+                name = name.decode(sys.getfilesystemencoding())
         else:
             name = files.relative_filename(filename)
         self.relname = name
@@ -115,6 +121,7 @@ class PythonFileReporter(FileReporter):
         self._statements = None
         self._excluded = None
 
+    @contract(returns='unicode')
     def relative_filename(self):
         return self.relname
 
diff --git a/coverage/summary.py b/coverage/summary.py
index 4dcaa735..f797e306 100644
--- a/coverage/summary.py
+++ b/coverage/summary.py
@@ -5,6 +5,7 @@
 
 import sys
 
+from coverage import env
 from coverage.report import Reporter
 from coverage.results import Numbers
 from coverage.misc import NotPython, CoverageException
@@ -20,38 +21,45 @@ class SummaryReporter(Reporter):
     def report(self, morfs, outfile=None):
         """Writes a report summarizing coverage statistics per module.
 
-        `outfile` is a file object to write the summary to.
+        `outfile` is a file object to write the summary to. It must be opened
+        for native strings (bytes on Python 2, Unicode on Python 3).
 
         """
         self.find_file_reporters(morfs)
 
         # Prepare the formatting strings
         max_name = max([len(fr.relative_filename()) for fr in self.file_reporters] + [5])
-        fmt_name = "%%- %ds  " % max_name
-        fmt_err = "%s   %s: %s\n"
-        fmt_skip_covered = "\n%s file%s skipped due to complete coverage.\n"
+        fmt_name = u"%%- %ds  " % max_name
+        fmt_err = u"%s   %s: %s\n"
+        fmt_skip_covered = u"\n%s file%s skipped due to complete coverage.\n"
 
-        header = (fmt_name % "Name") + " Stmts   Miss"
-        fmt_coverage = fmt_name + "%6d %6d"
+        header = (fmt_name % "Name") + u" Stmts   Miss"
+        fmt_coverage = fmt_name + u"%6d %6d"
         if self.branches:
-            header += " Branch BrPart"
-            fmt_coverage += " %6d %6d"
+            header += u" Branch BrPart"
+            fmt_coverage += u" %6d %6d"
         width100 = Numbers.pc_str_width()
-        header += "%*s" % (width100+4, "Cover")
-        fmt_coverage += "%%%ds%%%%" % (width100+3,)
+        header += u"%*s" % (width100+4, "Cover")
+        fmt_coverage += u"%%%ds%%%%" % (width100+3,)
         if self.config.show_missing:
-            header += "   Missing"
-            fmt_coverage += "   %s"
-        rule = "-" * len(header) + "\n"
-        header += "\n"
-        fmt_coverage += "\n"
+            header += u"   Missing"
+            fmt_coverage += u"   %s"
+        rule = u"-" * len(header) + u"\n"
+        header += u"\n"
+        fmt_coverage += u"\n"
 
-        if not outfile:
+        if outfile is None:
             outfile = sys.stdout
 
+        if env.PY2:
+            encoding = getattr(outfile, "encoding", None) or sys.getfilesystemencoding()
+            writeout = lambda u: outfile.write(u.encode(encoding))
+        else:
+            writeout = outfile.write
+
         # Write the header
-        outfile.write(header)
-        outfile.write(rule)
+        writeout(header)
+        writeout(rule)
 
         total = Numbers()
         skipped_count = 0
@@ -83,7 +91,7 @@ class SummaryReporter(Reporter):
                                 missing_fmtd += ", "
                             missing_fmtd += branches_fmtd
                     args += (missing_fmtd,)
-                outfile.write(fmt_coverage % args)
+                writeout(fmt_coverage % args)
             except Exception:
                 report_it = not self.config.ignore_errors
                 if report_it:
@@ -93,22 +101,22 @@ class SummaryReporter(Reporter):
                     if typ is NotPython and not fr.should_be_python():
                         report_it = False
                 if report_it:
-                    outfile.write(fmt_err % (fr.relative_filename(), typ.__name__, msg))
+                    writeout(fmt_err % (fr.relative_filename(), typ.__name__, msg))
 
         if total.n_files > 1:
-            outfile.write(rule)
+            writeout(rule)
             args = ("TOTAL", total.n_statements, total.n_missing)
             if self.branches:
                 args += (total.n_branches, total.n_partial_branches)
             args += (total.pc_covered_str,)
             if self.config.show_missing:
                 args += ("",)
-            outfile.write(fmt_coverage % args)
+            writeout(fmt_coverage % args)
 
         if not total.n_files and not skipped_count:
             raise CoverageException("No data to report.")
 
         if self.config.skip_covered and skipped_count:
-            outfile.write(fmt_skip_covered % (skipped_count, 's' if skipped_count > 1 else ''))
+            writeout(fmt_skip_covered % (skipped_count, 's' if skipped_count > 1 else ''))
 
         return total.n_statements and total.pc_covered
diff --git a/coverage/xmlreport.py b/coverage/xmlreport.py
index b8f8a9e4..d7c2f44a 100644
--- a/coverage/xmlreport.py
+++ b/coverage/xmlreport.py
@@ -8,6 +8,7 @@ import sys
 import time
 import xml.dom.minidom
 
+from coverage import env
 from coverage import __url__, __version__, files
 from coverage.misc import isolate_module
 from coverage.report import Reporter
@@ -116,7 +117,10 @@ class XmlReporter(Reporter):
         xcoverage.setAttribute("branch-rate", branch_rate)
 
         # Use the DOM to write the output file.
-        outfile.write(self.xml_out.toprettyxml())
+        out = self.xml_out.toprettyxml()
+        if env.PY2:
+            out = out.encode("utf8")
+        outfile.write(out)
 
         # Return the total percentage.
         denom = lnum_tot + bnum_tot
diff --git a/tests/helpers.py b/tests/helpers.py
index aa094bc1..2723ea59 100644
--- a/tests/helpers.py
+++ b/tests/helpers.py
@@ -4,6 +4,7 @@
 """Helpers for coverage.py tests."""
 
 import subprocess
+import sys
 
 
 def run_command(cmd):
@@ -12,8 +13,12 @@ def run_command(cmd):
     Returns the exit status code and the combined stdout and stderr.
 
     """
+    # In some strange cases (PyPy3 in a virtualenv!?) the stdout encoding of
+    # the subprocess is set incorrectly to ascii.  Use an environment variable
+    # to force the encoding to be the same as ours.
     proc = subprocess.Popen(
-        cmd, shell=True,
+        "PYTHONIOENCODING=%s %s" % (sys.__stdout__.encoding, cmd),
+        shell=True,
         stdin=subprocess.PIPE, stdout=subprocess.PIPE,
         stderr=subprocess.STDOUT
         )
diff --git a/tests/test_files.py b/tests/test_files.py
index e3d33285..e7353235 100644
--- a/tests/test_files.py
+++ b/tests/test_files.py
@@ -27,7 +27,7 @@ class FilesTest(CoverageTest):
     def test_simple(self):
         self.make_file("hello.py")
         files.set_relative_directory()
-        self.assertEqual(files.relative_filename("hello.py"), "hello.py")
+        self.assertEqual(files.relative_filename(u"hello.py"), u"hello.py")
         a = self.abs_path("hello.py")
         self.assertNotEqual(a, "hello.py")
         self.assertEqual(files.relative_filename(a), "hello.py")
diff --git a/tests/test_process.py b/tests/test_process.py
index 4902f7c0..8f69877f 100644
--- a/tests/test_process.py
+++ b/tests/test_process.py
@@ -884,6 +884,82 @@ class FailUnderEmptyFilesTest(CoverageTest):
         self.assertEqual(st, 2)
 
 
+class UnicodeFilePathsTest(CoverageTest):
+    """Tests of using non-ascii characters in the names of files."""
+
+    def test_snowman_dot_py(self):
+        # Make a file with a non-ascii character in the filename.
+        self.make_file(u"snowman☃.py", "print('snowman')")
+        out = self.run_command(u"coverage run snowman☃.py")
+        self.assertEqual(out, "snowman\n")
+
+        # The HTML report uses ascii-encoded HTML entities.
+        out = self.run_command("coverage html")
+        self.assertEqual(out, "")
+        self.assert_exists("htmlcov/snowman☃_py.html")
+        with open("htmlcov/index.html") as indexf:
+            index = indexf.read()
+        self.assertIn('<a href="snowman&#9731;_py.html">snowman&#9731;.py</a>', index)
+
+        # The XML report is always UTF8-encoded.
+        out = self.run_command("coverage xml")
+        self.assertEqual(out, "")
+        with open("coverage.xml", "rb") as xmlf:
+            xml = xmlf.read()
+        self.assertIn(u' filename="snowman☃.py"'.encode('utf8'), xml)
+        self.assertIn(u' name="snowman☃.py"'.encode('utf8'), xml)
+
+        report_expected = (
+            u"Name          Stmts   Miss  Cover\n"
+            u"---------------------------------\n"
+            u"snowman☃.py       1      0   100%\n"
+        )
+
+        if env.PY2:
+            report_expected = report_expected.encode("utf8")
+
+        out = self.run_command("coverage report")
+        self.assertEqual(out, report_expected)
+
+    def test_snowman_directory(self):
+        # Make a file with a non-ascii character in the directory name.
+        self.make_file(u"☃/snowman.py", "print('snowman')")
+        out = self.run_command(u"coverage run ☃/snowman.py")
+        self.assertEqual(out, "snowman\n")
+
+        # The HTML report uses ascii-encoded HTML entities.
+        out = self.run_command("coverage html")
+        self.assertEqual(out, "")
+        self.assert_exists("htmlcov/☃_snowman_py.html")
+        with open("htmlcov/index.html") as indexf:
+            index = indexf.read()
+        self.assertIn('<a href="&#9731;_snowman_py.html">&#9731;/snowman.py</a>', index)
+
+        # The XML report is always UTF8-encoded.
+        out = self.run_command("coverage xml")
+        self.assertEqual(out, "")
+        with open("coverage.xml", "rb") as xmlf:
+            xml = xmlf.read()
+        self.assertIn(u' filename="☃/snowman.py"'.encode('utf8'), xml)
+        self.assertIn(u' name="snowman.py"'.encode('utf8'), xml)
+        self.assertIn(
+            u'<package branch-rate="0" complexity="0" line-rate="1" name="☃">'.encode('utf8'),
+            xml
+        )
+
+        report_expected = (
+            u"Name           Stmts   Miss  Cover\n"
+            u"----------------------------------\n"
+            u"☃/snowman.py       1      0   100%\n"
+        )
+
+        if env.PY2:
+            report_expected = report_expected.encode("utf8")
+
+        out = self.run_command("coverage report")
+        self.assertEqual(out, report_expected)
+
+
 def possible_pth_dirs():
     """Produce a sequence of directories for trying to write .pth files."""
     # First look through sys.path, and we find a .pth file, then it's a good
diff --git a/tests/test_summary.py b/tests/test_summary.py
index cf55130a..f1331fea 100644
--- a/tests/test_summary.py
+++ b/tests/test_summary.py
@@ -1,3 +1,4 @@
+# coding: utf8
 # Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
 # For details: https://bitbucket.org/ned/coveragepy/src/default/NOTICE.txt
 
@@ -404,6 +405,33 @@ class SummaryTest(CoverageTest):
             "mycode.py NotPython: Couldn't parse 'mycode.py' as Python source: 'error' at line 1"
         )
 
+    def test_snowmandotpy_not_python(self):
+        # We run a .py file with a non-ascii name, and when reporting, we can't
+        # parse it as Python.  We should get an error message in the report.
+
+        self.make_file(u"snowman☃.py", "print('snowman')")
+        self.run_command(u"coverage run snowman☃.py")
+        self.make_file(u"snowman☃.py", "This isn't python at all!")
+        report = self.report_from_command(u"coverage report snowman☃.py")
+
+        # Name     Stmts   Miss  Cover
+        # ----------------------------
+        # xxxx   NotPython: Couldn't parse '...' as Python source: 'invalid syntax' at line 1
+        # No data to report.
+
+        last = self.squeezed_lines(report)[-2]
+        # The actual file name varies run to run.
+        last = re.sub(r"parse '.*(snowman.*?\.py)", r"parse '\1", last)
+        # The actual error message varies version to version
+        last = re.sub(r": '.*' at", ": 'error' at", last)
+        expected = (
+            u"snowman☃.py NotPython: "
+            u"Couldn't parse 'snowman☃.py' as Python source: 'error' at line 1"
+        )
+        if env.PY2:
+            expected = expected.encode("utf8")
+        self.assertEqual(last, expected)
+
     def test_dotpy_not_python_ignored(self):
         # We run a .py file, and when reporting, we can't parse it as Python,
         # but we've said to ignore errors, so there's no error reported.
author	Ned Batchelder <ned@nedbatchelder.com>	2015-10-25 17:40:09 -0400
committer	Ned Batchelder <ned@nedbatchelder.com>	2015-10-25 17:40:09 -0400
commit	b810cbc0d06df0a04e3380166215b8ad2f40524c (patch)
tree	b0ea1499815b84301fa7991e37de128265526e81
parent	15100248c12b85a00278371fea60f07718a9d499 (diff)
download	python-coveragepy-git-b810cbc0d06df0a04e3380166215b8ad2f40524c.tar.gz