From 6055667e8090fc09e55b96ae49d5ac6a37706ede Mon Sep 17 00:00:00 2001
From: Ned Batchelder <ned@nedbatchelder.com>
Date: Fri, 12 Dec 2014 08:51:19 -0500
Subject: Source is always Unicode in HTML code. More refactoring to come.

---
 coverage/backward.py     | 19 +++++++++++++-
 coverage/codeunit.py     | 44 ++++++++++++-------------------
 coverage/files.py        | 67 +++++++++++++++++++++++++++++++-----------------
 coverage/html.py         | 25 +++++-------------
 coverage/misc.py         |  2 ++
 coverage/parser.py       | 23 +++++++++--------
 coverage/phystokens.py   |  3 +++
 coverage/templite.py     |  6 ++++-
 tests/test_html.py       |  2 +-
 tests/test_misc.py       |  7 +++++
 tests/test_phystokens.py |  2 ++
 tests/test_templite.py   | 11 ++++++++
 12 files changed, 130 insertions(+), 81 deletions(-)
diff --git a/coverage/backward.py b/coverage/backward.py
index 50b21f3b..dfc169d4 100644
--- a/coverage/backward.py
+++ b/coverage/backward.py
@@ -20,6 +20,12 @@ try:
 except NameError:
     string_class = str
 
+# What's a Unicode string called?
+try:
+    unicode_class = unicode
+except NameError:
+    unicode_class = str
+
 # Where do pickles come from?
 try:
     import cPickle as pickle
@@ -66,6 +72,11 @@ if sys.version_info >= (3, 0):
         """Convert bytes `b` to a string."""
         return b.decode('utf8')
 
+    def unicode_literal(s):
+        """Make a plain string literal into unicode."""
+        # In Python 3, string literals already are unicode.
+        return s
+
     def binary_bytes(byte_values):
         """Produce a byte string with the ints from `byte_values`."""
         return bytes(byte_values)
@@ -88,6 +99,11 @@ else:
         """Convert bytes `b` to a string (no-op in 2.x)."""
         return b
 
+    def unicode_literal(s):
+        """Make a plain string literal into unicode."""
+        # In Python 2, s is a byte string.
+        return s.decode('utf8')
+
     def binary_bytes(byte_values):
         """Produce a byte string with the ints from `byte_values`."""
         return "".join(chr(b) for b in byte_values)
@@ -117,13 +133,14 @@ try:
 except ImportError:
     importlib = None
 
-# we only want to use importlib if it has everything we need.
+# We only want to use importlib if it has everything we need.
 try:
     importlib_util_find_spec = importlib.util.find_spec
 except Exception:
     import imp
     importlib_util_find_spec = None
 
+# What is the .pyc magic number for this version of Python?
 try:
     PYC_MAGIC_NUMBER = importlib.util.MAGIC_NUMBER
 except AttributeError:
diff --git a/coverage/codeunit.py b/coverage/codeunit.py
index 207383e0..f34967e5 100644
--- a/coverage/codeunit.py
+++ b/coverage/codeunit.py
@@ -1,10 +1,12 @@
 """Code unit (module) handling for Coverage."""
 
 import os
+import sys
 
-from coverage.backward import open_python_source, string_class
+from coverage.backward import string_class, unicode_class
+from coverage.files import get_python_source, get_zip_data
 from coverage.misc import CoverageException, NoSource
-from coverage.parser import CodeParser, PythonParser
+from coverage.parser import PythonParser
 from coverage.phystokens import source_token_lines, source_encoding
 
 
@@ -85,7 +87,7 @@ class CodeUnit(object):
         self._source = None
 
     def __repr__(self):
-        return "<CodeUnit name=%r filename=%r>" % (self.name, self.filename)
+        return "<{self.__class__.__name__} name={self.name!r} filename={self.filename!r}>".format(self=self)
 
     def _adjust_filename(self, f):
         # TODO: This shouldn't be in the base class, right?
@@ -124,29 +126,12 @@ class CodeUnit(object):
             return root.replace('\\', '_').replace('/', '_').replace('.', '_')
 
     def source(self):
-        if self._source is None:
-            self._source = self.get_source()
-        return self._source
-
-    def get_source(self):
-        """Return the source code, as a string."""
-        if os.path.exists(self.filename):
-            # A regular text file: open it.
-            with open_python_source(self.filename) as f:
-                return f.read()
-
-        # Maybe it's in a zip file?
-        source = self.file_locator.get_zip_data(self.filename)
-        if source is not None:
-            return source
-
-        # Couldn't find source.
-        raise CoverageException(
-            "No source for code '%s'." % self.filename
-            )
+        """Return the source for the code, a Unicode string."""
+        return unicode_class("???")
 
     def source_token_lines(self):
         """Return the 'tokenized' text for the code."""
+        # A generic implementation, each line is one "txt" token.
         for line in self.source().splitlines():
             yield [('txt', line)]
 
@@ -174,6 +159,14 @@ class PythonCodeUnit(CodeUnit):
             fname = fname[:-9] + ".py"
         return fname
 
+    def source(self):  # Returns Unicode text; read and decoded only once.
+        if self._source is None:
+            text = get_python_source(self.filename)
+            if sys.version_info < (3, 0):  # Python 2 reads bytes: decode them.
+                text = text.decode(source_encoding(text), "replace")
+            self._source = text  # Cache last, so a failed decode caches nothing.
+        return self._source
+
     def get_parser(self, exclude=None):
         actual_filename, source = self._find_source(self.filename)
         return PythonParser(
@@ -213,7 +206,7 @@ class PythonCodeUnit(CodeUnit):
             try_filename = base + try_ext
             if os.path.exists(try_filename):
                 return try_filename, None
-            source = self.file_locator.get_zip_data(try_filename)
+            source = get_zip_data(try_filename)
             if source:
                 return try_filename, source
         raise NoSource("No source for code: '%s'" % filename)
@@ -240,6 +233,3 @@ class PythonCodeUnit(CodeUnit):
 
     def source_token_lines(self):
         return source_token_lines(self.source())
-
-    def source_encoding(self):
-        return source_encoding(self.source())
diff --git a/coverage/files.py b/coverage/files.py
index c2a5ec72..1400b6eb 100644
--- a/coverage/files.py
+++ b/coverage/files.py
@@ -1,10 +1,12 @@
 """File wrangling."""
 
-from coverage.backward import to_string
-from coverage.misc import CoverageException, join_regex
 import fnmatch, os, os.path, re, sys
 import ntpath, posixpath
 
+from coverage.backward import to_string, open_python_source
+from coverage.misc import CoverageException, NoSource, join_regex
+
+
 class FileLocator(object):
     """Understand how filenames work."""
 
@@ -47,29 +49,48 @@ class FileLocator(object):
             self.canonical_filename_cache[filename] = cf
         return self.canonical_filename_cache[filename]
 
-    def get_zip_data(self, filename):
-        """Get data from `filename` if it is a zip file path.
 
-        Returns the string data read from the zip file, or None if no zip file
-        could be found or `filename` isn't in it.  The data returned will be
-        an empty string if the file is empty.
+def get_python_source(filename):
+    """Return the source of `filename` (on disk or zipped), or raise NoSource."""
+    if os.path.exists(filename):
+        # A regular text file: open it.
+        with open_python_source(filename) as f:
+            return f.read()
 
-        """
-        import zipimport
-        markers = ['.zip'+os.sep, '.egg'+os.sep]
-        for marker in markers:
-            if marker in filename:
-                parts = filename.split(marker)
-                try:
-                    zi = zipimport.zipimporter(parts[0]+marker[:-1])
-                except zipimport.ZipImportError:
-                    continue
-                try:
-                    data = zi.get_data(parts[1])
-                except IOError:
-                    continue
-                return to_string(data)
-        return None
+    # Maybe it's in a zip file?
+    source = get_zip_data(filename)
+    if source is not None:
+        return source
+
+    # Couldn't find source.  NoSource matches what PythonParser raises.
+    raise NoSource(  # NOTE(review): assumes NoSource subclasses CoverageException.
+        "No source for code: '%s'." % filename
+        )
+
+
+def get_zip_data(filename):
+    """Get data from `filename` if it is a zip file path.
+
+    The archive path ends at the first '.zip' or '.egg' directory marker in
+    `filename`; the rest names the member.  Returns the string data read, or
+    None if no zip file could be found or `filename` isn't in it.  The data
+    returned will be an empty string if the file is empty.
+    """
+    import zipimport
+    markers = ['.zip'+os.sep, '.egg'+os.sep]
+    for marker in markers:
+        if marker in filename:
+            archive, _, member = filename.partition(marker)
+            try:
+                zi = zipimport.zipimporter(archive+marker[:-1])
+            except zipimport.ZipImportError:
+                continue
+            try:
+                data = zi.get_data(member)
+            except IOError:
+                continue
+            return to_string(data)
+    return None
 
 
 if sys.platform == 'win32':
diff --git a/coverage/html.py b/coverage/html.py
index a4b46a23..677e5e83 100644
--- a/coverage/html.py
+++ b/coverage/html.py
@@ -1,5 +1,7 @@
 """HTML reporting for Coverage."""
 
+from __future__ import unicode_literals
+
 import json, os, re, shutil, sys
 
 import coverage
@@ -65,10 +67,13 @@ class HtmlReporter(Reporter):
     def __init__(self, cov, config):
         super(HtmlReporter, self).__init__(cov, config)
         self.directory = None
+        title = self.config.html_title
+        if sys.version_info < (3, 0):
+            title = title.decode("utf8")
         self.template_globals = {
             'escape': escape,
             'pair': pair,
-            'title': self.config.html_title,
+            'title': title,
             '__url__': coverage.__url__,
             '__version__': coverage.__version__,
             }
@@ -154,7 +159,7 @@ class HtmlReporter(Reporter):
 
         # Find out if the file on disk is already correct.
         flat_rootname = cu.flat_rootname()
-        this_hash = self.file_hash(source, cu)
+        this_hash = self.file_hash(source.encode('utf-8'), cu)
         that_hash = self.status.file_hash(flat_rootname)
         if this_hash == that_hash:
             # Nothing has changed to require the file to be reported again.
@@ -163,15 +168,6 @@ class HtmlReporter(Reporter):
 
         self.status.set_file_hash(flat_rootname, this_hash)
 
-        # If need be, determine the encoding of the source file. We use it
-        # later to properly write the HTML.
-        if sys.version_info < (3, 0):
-            encoding = cu.source_encoding()
-            # Some UTF8 files have the dreaded UTF8 BOM. If so, junk it.
-            if encoding.startswith("utf-8") and source[:3] == "\xef\xbb\xbf":
-                source = source[3:]
-                encoding = "utf-8"
-
         # Get the numbers for this file.
         nums = analysis.numbers
 
@@ -239,11 +235,6 @@ class HtmlReporter(Reporter):
             'cu': cu, 'nums': nums, 'lines': lines,
         }))
 
-        if sys.version_info < (3, 0):
-            # In theory, all the characters in the source can be decoded, but
-            # strange things happen, so use 'replace' to keep errors at bay.
-            html = html.decode(encoding, 'replace')
-
         html_filename = flat_rootname + ".html"
         html_path = os.path.join(self.directory, html_filename)
         self.write_html(html_path, html)
@@ -272,8 +263,6 @@ class HtmlReporter(Reporter):
             'totals': self.totals,
         })
 
-        if sys.version_info < (3, 0):
-            html = html.decode("utf-8")
         self.write_html(
             os.path.join(self.directory, "index.html"),
             html
diff --git a/coverage/misc.py b/coverage/misc.py
index f0e043b9..924199ef 100644
--- a/coverage/misc.py
+++ b/coverage/misc.py
@@ -109,6 +109,8 @@ class Hasher(object):
         self.md5.update(to_bytes(str(type(v))))
         if isinstance(v, string_class):
             self.md5.update(to_bytes(v))
+        elif isinstance(v, bytes):
+            self.md5.update(v)
         elif v is None:
             pass
         elif isinstance(v, (int, float)):
diff --git a/coverage/parser.py b/coverage/parser.py
index ef2ee5b8..97cc01bb 100644
--- a/coverage/parser.py
+++ b/coverage/parser.py
@@ -2,10 +2,10 @@
 
 import collections, dis, re, token, tokenize
 
-from coverage.backward import StringIO
 from coverage.backward import range    # pylint: disable=redefined-builtin
-from coverage.backward import bytes_to_ints, open_python_source
+from coverage.backward import bytes_to_ints
 from coverage.bytecode import ByteCodes, CodeObjects
+from coverage.files import get_python_source
 from coverage.misc import nice_pair, expensive, join_regex
 from coverage.misc import CoverageException, NoSource, NotPython
 
@@ -42,8 +42,7 @@ class PythonParser(CodeParser):
         self.text = text
         if not self.text:
             try:
-                with open_python_source(self.filename) as sourcef:
-                    self.text = sourcef.read()
+                self.text = get_python_source(self.filename)
             except IOError as err:
                 raise NoSource(
                     "No source for code: '%s': %s" % (self.filename, err)
@@ -345,8 +344,7 @@ class ByteParser(object):
         else:
             if not text:
                 assert filename, "If no code or text, need a filename"
-                with open_python_source(filename) as sourcef:
-                    text = sourcef.read()
+                text = get_python_source(filename)
             self.text = text
 
             try:
@@ -692,11 +690,16 @@ class CachedTokenizer(object):
 
     def generate_tokens(self, text):
         """A stand-in for `tokenize.generate_tokens`."""
-        if text != self.last_text:
+        # Check the type first so we don't compare bytes to unicode and get
+        # warnings.
+        if type(text) != type(self.last_text) or text != self.last_text:
             self.last_text = text
-            self.last_tokens = list(
-                tokenize.generate_tokens(StringIO(text).readline)
-            )
+            line_iter = iter(text.splitlines(True))
+            try:
+                readline = line_iter.next
+            except AttributeError:
+                readline = line_iter.__next__
+            self.last_tokens = list(tokenize.generate_tokens(readline))
         return self.last_tokens
 
 # Create our generate_tokens cache as a callable replacement function.
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
index fe77c7de..3fd1165c 100644
--- a/coverage/phystokens.py
+++ b/coverage/phystokens.py
@@ -1,6 +1,7 @@
 """Better tokenizing for coverage.py."""
 
 import codecs, keyword, re, sys, token, tokenize
+
 from coverage.parser import generate_tokens
 
 
@@ -75,6 +76,7 @@ def source_token_lines(source):
     is indistinguishable from a final line with a newline.
 
     """
+
     ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL])
     line = []
     col = 0
@@ -108,6 +110,7 @@ def source_token_lines(source):
     if line:
         yield line
 
+
 def source_encoding(source):
     """Determine the encoding for `source` (a string), according to PEP 263.
 
diff --git a/coverage/templite.py b/coverage/templite.py
index 5877c058..c2e8981f 100644
--- a/coverage/templite.py
+++ b/coverage/templite.py
@@ -3,6 +3,7 @@
 # Coincidentally named the same as http://code.activestate.com/recipes/496702/
 
 import re
+import sys
 
 
 class TempliteSyntaxError(ValueError):
@@ -116,7 +117,10 @@ class Templite(object):
         code.add_line("result = []")
         code.add_line("append_result = result.append")
         code.add_line("extend_result = result.extend")
-        code.add_line("to_str = str")
+        if sys.version_info < (3, 0):
+            code.add_line("to_str = unicode")
+        else:
+            code.add_line("to_str = str")
 
         buffered = []
         def flush_output():
diff --git a/tests/test_html.py b/tests/test_html.py
index 9448447b..cf143cc1 100644
--- a/tests/test_html.py
+++ b/tests/test_html.py
@@ -383,6 +383,6 @@ class HtmlStaticFileTest(CoverageTest):
         self.make_file("main.py", "print(17)")
         cov = coverage.coverage()
         self.start_import_stop(cov, "main")
-        msg = "Couldn't find static file '.*'"
+        msg = "Couldn't find static file u?'.*'"
         with self.assertRaisesRegex(CoverageException, msg):
             cov.html_report()
diff --git a/tests/test_misc.py b/tests/test_misc.py
index 7b7d51d0..d9b0c4e6 100644
--- a/tests/test_misc.py
+++ b/tests/test_misc.py
@@ -22,6 +22,13 @@ class HasherTest(CoverageTest):
         self.assertNotEqual(h1.hexdigest(), h2.hexdigest())
         self.assertEqual(h1.hexdigest(), h3.hexdigest())
 
+    def test_bytes_hashing(self):
+        h1 = Hasher()
+        h1.update(b"Hello, world!")
+        h2 = Hasher()
+        h2.update(b"Goodbye!")
+        self.assertNotEqual(h1.hexdigest(), h2.hexdigest())
+
     def test_dict_hashing(self):
         h1 = Hasher()
         h1.update({'a': 17, 'b': 23})
diff --git a/tests/test_phystokens.py b/tests/test_phystokens.py
index ccd5682a..10e0225f 100644
--- a/tests/test_phystokens.py
+++ b/tests/test_phystokens.py
@@ -1,5 +1,7 @@
 """Tests for Coverage.py's improved tokenizer."""
 
+#from __future__ import unicode_literals
+
 import os, re, sys
 from tests.coveragetest import CoverageTest
 from coverage.phystokens import source_token_lines, source_encoding
diff --git a/tests/test_templite.py b/tests/test_templite.py
index b3e21e70..56033ec1 100644
--- a/tests/test_templite.py
+++ b/tests/test_templite.py
@@ -1,7 +1,11 @@
+# -*- coding: utf8 -*-
 """Tests for coverage.templite."""
 
 import re
+
+from coverage.backward import unicode_literal
 from coverage.templite import Templite, TempliteSyntaxError
+
 from tests.coveragetest import CoverageTest
 
 # pylint: disable=unused-variable
@@ -232,6 +236,13 @@ class TempliteTest(CoverageTest):
             "@a0b0c0a1b1c1a2b2c2!"
             )
 
+    def test_non_ascii(self):
+        self.try_render(
+            unicode_literal("{{where}} ollǝɥ"),
+            { 'where': unicode_literal('ǝɹǝɥʇ') },
+            unicode_literal("ǝɹǝɥʇ ollǝɥ")
+        )
+
     def test_exception_during_evaluation(self):
         # TypeError: Couldn't evaluate {{ foo.bar.baz }}:
         # 'NoneType' object is unsubscriptable
-- 
cgit v1.2.1