From 6055667e8090fc09e55b96ae49d5ac6a37706ede Mon Sep 17 00:00:00 2001 From: Ned Batchelder Date: Fri, 12 Dec 2014 08:51:19 -0500 Subject: Source is always Unicode in HTML code. More refactoring to come. --- coverage/backward.py | 19 +++++++++++++- coverage/codeunit.py | 44 ++++++++++++------------------- coverage/files.py | 67 +++++++++++++++++++++++++++++++----------------- coverage/html.py | 25 +++++------------- coverage/misc.py | 2 ++ coverage/parser.py | 23 +++++++++-------- coverage/phystokens.py | 3 +++ coverage/templite.py | 6 ++++- tests/test_html.py | 2 +- tests/test_misc.py | 7 +++++ tests/test_phystokens.py | 2 ++ tests/test_templite.py | 11 ++++++++ 12 files changed, 130 insertions(+), 81 deletions(-) diff --git a/coverage/backward.py b/coverage/backward.py index 50b21f3b..dfc169d4 100644 --- a/coverage/backward.py +++ b/coverage/backward.py @@ -20,6 +20,12 @@ try: except NameError: string_class = str +# What's a Unicode string called? +try: + unicode_class = unicode +except NameError: + unicode_class = str + # Where do pickles come from? try: import cPickle as pickle @@ -66,6 +72,11 @@ if sys.version_info >= (3, 0): """Convert bytes `b` to a string.""" return b.decode('utf8') + def unicode_literal(s): + """Make a plain string literal into unicode.""" + # In Python 3, string literals already are unicode. + return s + def binary_bytes(byte_values): """Produce a byte string with the ints from `byte_values`.""" return bytes(byte_values) @@ -88,6 +99,11 @@ else: """Convert bytes `b` to a string (no-op in 2.x).""" return b + def unicode_literal(s): + """Make a plain string literal into unicode.""" + # In Python 2, s is a byte string. + return s.decode('utf8') + def binary_bytes(byte_values): """Produce a byte string with the ints from `byte_values`.""" return "".join(chr(b) for b in byte_values) @@ -117,13 +133,14 @@ try: except ImportError: importlib = None -# we only want to use importlib if it has everything we need. 
+# We only want to use importlib if it has everything we need. try: importlib_util_find_spec = importlib.util.find_spec except Exception: import imp importlib_util_find_spec = None +# What is the .pyc magic number for this version of Python? try: PYC_MAGIC_NUMBER = importlib.util.MAGIC_NUMBER except AttributeError: diff --git a/coverage/codeunit.py b/coverage/codeunit.py index 207383e0..f34967e5 100644 --- a/coverage/codeunit.py +++ b/coverage/codeunit.py @@ -1,10 +1,12 @@ """Code unit (module) handling for Coverage.""" import os +import sys -from coverage.backward import open_python_source, string_class +from coverage.backward import string_class, unicode_class +from coverage.files import get_python_source, get_zip_data from coverage.misc import CoverageException, NoSource -from coverage.parser import CodeParser, PythonParser +from coverage.parser import PythonParser from coverage.phystokens import source_token_lines, source_encoding @@ -85,7 +87,7 @@ class CodeUnit(object): self._source = None def __repr__(self): - return "<CodeUnit name=%r filename=%r>" % (self.name, self.filename) + return "<{self.__class__.__name__} name={self.name!r} filename={self.filename!r}>".format(self=self) def _adjust_filename(self, f): # TODO: This shouldn't be in the base class, right? @@ -124,29 +126,12 @@ class CodeUnit(object): return root.replace('\\', '_').replace('/', '_').replace('.', '_') def source(self): - if self._source is None: - self._source = self.get_source() - return self._source - - def get_source(self): - """Return the source code, as a string.""" - if os.path.exists(self.filename): - # A regular text file: open it. - with open_python_source(self.filename) as f: - return f.read() - - # Maybe it's in a zip file? - source = self.file_locator.get_zip_data(self.filename) - if source is not None: - return source - - # Couldn't find source. - raise CoverageException( - "No source for code '%s'."
% self.filename - ) + """Return the source for the code, a Unicode string.""" + return unicode_class("???") def source_token_lines(self): """Return the 'tokenized' text for the code.""" + # A generic implementation, each line is one "txt" token. for line in self.source().splitlines(): yield [('txt', line)] @@ -174,6 +159,14 @@ class PythonCodeUnit(CodeUnit): fname = fname[:-9] + ".py" return fname + def source(self): + if self._source is None: + self._source = get_python_source(self.filename) + if sys.version_info < (3, 0): + encoding = source_encoding(self._source) + self._source = self._source.decode(encoding, "replace") + return self._source + def get_parser(self, exclude=None): actual_filename, source = self._find_source(self.filename) return PythonParser( @@ -213,7 +206,7 @@ class PythonCodeUnit(CodeUnit): try_filename = base + try_ext if os.path.exists(try_filename): return try_filename, None - source = self.file_locator.get_zip_data(try_filename) + source = get_zip_data(try_filename) if source: return try_filename, source raise NoSource("No source for code: '%s'" % filename) @@ -240,6 +233,3 @@ class PythonCodeUnit(CodeUnit): def source_token_lines(self): return source_token_lines(self.source()) - - def source_encoding(self): - return source_encoding(self.source()) diff --git a/coverage/files.py b/coverage/files.py index c2a5ec72..1400b6eb 100644 --- a/coverage/files.py +++ b/coverage/files.py @@ -1,10 +1,12 @@ """File wrangling.""" -from coverage.backward import to_string -from coverage.misc import CoverageException, join_regex import fnmatch, os, os.path, re, sys import ntpath, posixpath +from coverage.backward import to_string, open_python_source +from coverage.misc import CoverageException, join_regex + + class FileLocator(object): """Understand how filenames work.""" @@ -47,29 +49,48 @@ class FileLocator(object): self.canonical_filename_cache[filename] = cf return self.canonical_filename_cache[filename] - def get_zip_data(self, filename): - """Get data 
from `filename` if it is a zip file path. - Returns the string data read from the zip file, or None if no zip file - could be found or `filename` isn't in it. The data returned will be - an empty string if the file is empty. +def get_python_source(filename): + """Return the source code, as a string.""" + if os.path.exists(filename): + # A regular text file: open it. + with open_python_source(filename) as f: + return f.read() - """ - import zipimport - markers = ['.zip'+os.sep, '.egg'+os.sep] - for marker in markers: - if marker in filename: - parts = filename.split(marker) - try: - zi = zipimport.zipimporter(parts[0]+marker[:-1]) - except zipimport.ZipImportError: - continue - try: - data = zi.get_data(parts[1]) - except IOError: - continue - return to_string(data) - return None + # Maybe it's in a zip file? + source = get_zip_data(filename) + if source is not None: + return source + + # Couldn't find source. + raise CoverageException( + "No source for code: '%s'." % filename + ) + + +def get_zip_data(filename): + """Get data from `filename` if it is a zip file path. + + Returns the string data read from the zip file, or None if no zip file + could be found or `filename` isn't in it. The data returned will be + an empty string if the file is empty. 
+ + """ + import zipimport + markers = ['.zip'+os.sep, '.egg'+os.sep] + for marker in markers: + if marker in filename: + parts = filename.split(marker) + try: + zi = zipimport.zipimporter(parts[0]+marker[:-1]) + except zipimport.ZipImportError: + continue + try: + data = zi.get_data(parts[1]) + except IOError: + continue + return to_string(data) + return None if sys.platform == 'win32': diff --git a/coverage/html.py b/coverage/html.py index a4b46a23..677e5e83 100644 --- a/coverage/html.py +++ b/coverage/html.py @@ -1,5 +1,7 @@ """HTML reporting for Coverage.""" +from __future__ import unicode_literals + import json, os, re, shutil, sys import coverage @@ -65,10 +67,13 @@ class HtmlReporter(Reporter): def __init__(self, cov, config): super(HtmlReporter, self).__init__(cov, config) self.directory = None + title = self.config.html_title + if sys.version_info < (3, 0): + title = title.decode("utf8") self.template_globals = { 'escape': escape, 'pair': pair, - 'title': self.config.html_title, + 'title': title, '__url__': coverage.__url__, '__version__': coverage.__version__, } @@ -154,7 +159,7 @@ class HtmlReporter(Reporter): # Find out if the file on disk is already correct. flat_rootname = cu.flat_rootname() - this_hash = self.file_hash(source, cu) + this_hash = self.file_hash(source.encode('utf-8'), cu) that_hash = self.status.file_hash(flat_rootname) if this_hash == that_hash: # Nothing has changed to require the file to be reported again. @@ -163,15 +168,6 @@ class HtmlReporter(Reporter): self.status.set_file_hash(flat_rootname, this_hash) - # If need be, determine the encoding of the source file. We use it - # later to properly write the HTML. - if sys.version_info < (3, 0): - encoding = cu.source_encoding() - # Some UTF8 files have the dreaded UTF8 BOM. If so, junk it. - if encoding.startswith("utf-8") and source[:3] == "\xef\xbb\xbf": - source = source[3:] - encoding = "utf-8" - # Get the numbers for this file. 
nums = analysis.numbers @@ -239,11 +235,6 @@ class HtmlReporter(Reporter): 'cu': cu, 'nums': nums, 'lines': lines, })) - if sys.version_info < (3, 0): - # In theory, all the characters in the source can be decoded, but - # strange things happen, so use 'replace' to keep errors at bay. - html = html.decode(encoding, 'replace') - html_filename = flat_rootname + ".html" html_path = os.path.join(self.directory, html_filename) self.write_html(html_path, html) @@ -272,8 +263,6 @@ class HtmlReporter(Reporter): 'totals': self.totals, }) - if sys.version_info < (3, 0): - html = html.decode("utf-8") self.write_html( os.path.join(self.directory, "index.html"), html diff --git a/coverage/misc.py b/coverage/misc.py index f0e043b9..924199ef 100644 --- a/coverage/misc.py +++ b/coverage/misc.py @@ -109,6 +109,8 @@ class Hasher(object): self.md5.update(to_bytes(str(type(v)))) if isinstance(v, string_class): self.md5.update(to_bytes(v)) + elif isinstance(v, bytes): + self.md5.update(v) elif v is None: pass elif isinstance(v, (int, float)): diff --git a/coverage/parser.py b/coverage/parser.py index ef2ee5b8..97cc01bb 100644 --- a/coverage/parser.py +++ b/coverage/parser.py @@ -2,10 +2,10 @@ import collections, dis, re, token, tokenize -from coverage.backward import StringIO from coverage.backward import range # pylint: disable=redefined-builtin -from coverage.backward import bytes_to_ints, open_python_source +from coverage.backward import bytes_to_ints from coverage.bytecode import ByteCodes, CodeObjects +from coverage.files import get_python_source from coverage.misc import nice_pair, expensive, join_regex from coverage.misc import CoverageException, NoSource, NotPython @@ -42,8 +42,7 @@ class PythonParser(CodeParser): self.text = text if not self.text: try: - with open_python_source(self.filename) as sourcef: - self.text = sourcef.read() + self.text = get_python_source(self.filename) except IOError as err: raise NoSource( "No source for code: '%s': %s" % (self.filename, err) @@ 
-345,8 +344,7 @@ class ByteParser(object): else: if not text: assert filename, "If no code or text, need a filename" - with open_python_source(filename) as sourcef: - text = sourcef.read() + text = get_python_source(filename) self.text = text try: @@ -692,11 +690,16 @@ class CachedTokenizer(object): def generate_tokens(self, text): """A stand-in for `tokenize.generate_tokens`.""" - if text != self.last_text: + # Check the type first so we don't compare bytes to unicode and get + # warnings. + if type(text) != type(self.last_text) or text != self.last_text: self.last_text = text - self.last_tokens = list( - tokenize.generate_tokens(StringIO(text).readline) - ) + line_iter = iter(text.splitlines(True)) + try: + readline = line_iter.next + except AttributeError: + readline = line_iter.__next__ + self.last_tokens = list(tokenize.generate_tokens(readline)) return self.last_tokens # Create our generate_tokens cache as a callable replacement function. diff --git a/coverage/phystokens.py b/coverage/phystokens.py index fe77c7de..3fd1165c 100644 --- a/coverage/phystokens.py +++ b/coverage/phystokens.py @@ -1,6 +1,7 @@ """Better tokenizing for coverage.py.""" import codecs, keyword, re, sys, token, tokenize + from coverage.parser import generate_tokens @@ -75,6 +76,7 @@ def source_token_lines(source): is indistinguishable from a final line with a newline. """ + ws_tokens = set([token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]) line = [] col = 0 @@ -108,6 +110,7 @@ def source_token_lines(source): if line: yield line + def source_encoding(source): """Determine the encoding for `source` (a string), according to PEP 263. 
diff --git a/coverage/templite.py b/coverage/templite.py index 5877c058..c2e8981f 100644 --- a/coverage/templite.py +++ b/coverage/templite.py @@ -3,6 +3,7 @@ # Coincidentally named the same as http://code.activestate.com/recipes/496702/ import re +import sys class TempliteSyntaxError(ValueError): @@ -116,7 +117,10 @@ class Templite(object): code.add_line("result = []") code.add_line("append_result = result.append") code.add_line("extend_result = result.extend") - code.add_line("to_str = str") + if sys.version_info < (3, 0): + code.add_line("to_str = unicode") + else: + code.add_line("to_str = str") buffered = [] def flush_output(): diff --git a/tests/test_html.py b/tests/test_html.py index 9448447b..cf143cc1 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -383,6 +383,6 @@ class HtmlStaticFileTest(CoverageTest): self.make_file("main.py", "print(17)") cov = coverage.coverage() self.start_import_stop(cov, "main") - msg = "Couldn't find static file '.*'" + msg = "Couldn't find static file u?'.*'" with self.assertRaisesRegex(CoverageException, msg): cov.html_report() diff --git a/tests/test_misc.py b/tests/test_misc.py index 7b7d51d0..d9b0c4e6 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -22,6 +22,13 @@ class HasherTest(CoverageTest): self.assertNotEqual(h1.hexdigest(), h2.hexdigest()) self.assertEqual(h1.hexdigest(), h3.hexdigest()) + def test_bytes_hashing(self): + h1 = Hasher() + h1.update(b"Hello, world!") + h2 = Hasher() + h2.update(b"Goodbye!") + self.assertNotEqual(h1.hexdigest(), h2.hexdigest()) + def test_dict_hashing(self): h1 = Hasher() h1.update({'a': 17, 'b': 23}) diff --git a/tests/test_phystokens.py b/tests/test_phystokens.py index ccd5682a..10e0225f 100644 --- a/tests/test_phystokens.py +++ b/tests/test_phystokens.py @@ -1,5 +1,7 @@ """Tests for Coverage.py's improved tokenizer.""" +#from __future__ import unicode_literals + import os, re, sys from tests.coveragetest import CoverageTest from coverage.phystokens import 
source_token_lines, source_encoding diff --git a/tests/test_templite.py b/tests/test_templite.py index b3e21e70..56033ec1 100644 --- a/tests/test_templite.py +++ b/tests/test_templite.py @@ -1,7 +1,11 @@ +# -*- coding: utf8 -*- """Tests for coverage.templite.""" import re + +from coverage.backward import unicode_literal from coverage.templite import Templite, TempliteSyntaxError + from tests.coveragetest import CoverageTest # pylint: disable=unused-variable @@ -232,6 +236,13 @@ class TempliteTest(CoverageTest): "@a0b0c0a1b1c1a2b2c2!" ) + def test_non_ascii(self): + self.try_render( + unicode_literal("{{where}} ollǝɥ"), + { 'where': unicode_literal('ǝɹǝɥʇ') }, + unicode_literal("ǝɹǝɥʇ ollǝɥ") + ) + def test_exception_during_evaluation(self): # TypeError: Couldn't evaluate {{ foo.bar.baz }}: # 'NoneType' object is unsubscriptable -- cgit v1.2.1