-rw-r--r--  coverage/html.py           121
-rw-r--r--  coverage/phystokens.py     101
-rw-r--r--  test/stress_phystoken.txt   35
-rw-r--r--  test/test_phystokens.py     58
4 files changed, 229 insertions, 86 deletions
diff --git a/coverage/html.py b/coverage/html.py
index 23fedd62..37c754bd 100644
--- a/coverage/html.py
+++ b/coverage/html.py
@@ -1,9 +1,9 @@
"""HTML reporting for Coverage."""
-import keyword, os, re, token, tokenize, shutil
+import os, re, shutil
from coverage import __url__, __version__ # pylint: disable-msg=W0611
-from coverage.backward import StringIO # pylint: disable-msg=W0622
+from coverage.phystokens import source_token_lines
from coverage.report import Reporter
from coverage.templite import Templite
@@ -20,33 +20,6 @@ def data(fname):
return open(data_filename(fname)).read()
-def phys_tokens(toks):
- """Return all physical tokens, even line continuations.
-
- tokenize.generate_tokens() doesn't return a token for the backslash that
- continues lines. This wrapper provides those tokens so that we can
- re-create a faithful representation of the original source.
-
- Returns the same values as generate_tokens()
-
- """
- last_line = None
- last_lineno = -1
- for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
- if last_lineno != elineno:
- if last_line and last_line[-2:] == "\\\n":
- if ttype != token.STRING:
- ccol = len(last_line.split("\n")[-2]) - 1
- yield (
- 99999, "\\\n",
- (slineno, ccol), (slineno, ccol+2),
- last_line
- )
- last_line = ltext
- yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
- last_lineno = elineno
-
-
class HtmlReporter(Reporter):
"""HTML reporting."""
@@ -88,7 +61,6 @@ class HtmlReporter(Reporter):
"""Generate an HTML file for one source file."""
source = cu.source_file().read().expandtabs(4)
- source_lines = source.split("\n")
nums = analysis.numbers
@@ -102,63 +74,41 @@ class HtmlReporter(Reporter):
c_mis = " mis"
c_par = " par"
- ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
lines = []
- line = []
- lineno = 1
- col = 0
- tokgen = tokenize.generate_tokens(StringIO(source).readline)
- for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
- mark_start = True
- for part in re.split('(\n)', ttext):
- if part == '\n':
-
- line_class = ""
- annotate = ""
- if lineno in analysis.statements:
- line_class += " stm"
- if lineno in analysis.excluded:
- line_class += c_exc
- elif lineno in analysis.missing:
- line_class += c_mis
- elif self.arcs and lineno in missing_branch_arcs:
- line_class += c_par
- n_par += 1
- annotate = " ".join(map(str, missing_branch_arcs[lineno]))
- elif lineno in analysis.statements:
- line_class += c_run
-
- lineinfo = {
- 'html': "".join(line),
- 'number': lineno,
- 'class': line_class.strip() or "pln",
- 'annotate': annotate,
- }
- lines.append(lineinfo)
-
- line = []
- lineno += 1
- col = 0
- mark_end = False
- elif part == '':
- mark_end = False
- elif ttype in ws_tokens:
- mark_end = False
+
+ for lineno, line in enumerate(source_token_lines(source)):
+ lineno += 1 # 1-based line numbers.
+ # Figure out how to mark this line.
+ line_class = ""
+ annotate = ""
+ if lineno in analysis.statements:
+ line_class += " stm"
+ if lineno in analysis.excluded:
+ line_class += c_exc
+ elif lineno in analysis.missing:
+ line_class += c_mis
+ elif self.arcs and lineno in missing_branch_arcs:
+ line_class += c_par
+ n_par += 1
+ annotate = " ".join(map(str, missing_branch_arcs[lineno]))
+ elif lineno in analysis.statements:
+ line_class += c_run
+
+ # Build the HTML for the line
+ html = ""
+ for tok_type, tok_text in line:
+ if tok_type == "ws":
+ html += escape(tok_text)
else:
- if mark_start and scol > col:
- line.append(escape(" " * (scol - col)))
- mark_start = False
- tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
- if ttype == token.NAME and keyword.iskeyword(ttext):
- tok_class = "key"
- tok_html = escape(part) or '&nbsp;'
- line.append(
- "<span class='%s'>%s</span>" % (tok_class, tok_html)
- )
- mark_end = True
- scol = 0
- if mark_end:
- col = ecol
+ tok_html = escape(tok_text) or '&nbsp;'
+ html += "<span class='%s'>%s</span>" % (tok_type, tok_html)
+
+ lines.append({
+ 'html': html,
+ 'number': lineno,
+ 'class': line_class.strip() or "pln",
+ 'annotate': annotate,
+ })
# Write the HTML page for this file.
html_filename = cu.flat_rootname() + ".html"
@@ -184,7 +134,6 @@ class HtmlReporter(Reporter):
arcs = self.arcs
totals = sum([f['nums'] for f in files])
- total_par = sum([f['par'] for f in files])
fhtml = open(os.path.join(self.directory, "index.html"), "w")
fhtml.write(index_tmpl.render(locals()))
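
For orientation, here is a minimal sketch (not part of this commit) of what the new rendering loop in html_file() does with one tokenized line. The escape() helper is assumed to be html.py's existing HTML-escaping function, and the sample line below is hypothetical:

    # One "line" from source_token_lines() is a list of (class, text) pairs.
    line = [('key', 'def'), ('ws', ' '), ('nam', 'foo'),
            ('op', '('), ('op', ')'), ('op', ':')]

    def escape(text):
        # Stand-in for html.py's escape(); assumed to HTML-escape the text.
        return text.replace("&", "&amp;").replace("<", "&lt;").replace(">", "&gt;")

    html = ""
    for tok_type, tok_text in line:
        if tok_type == "ws":
            html += escape(tok_text)
        else:
            tok_html = escape(tok_text) or '&nbsp;'
            html += "<span class='%s'>%s</span>" % (tok_type, tok_html)

    # html is now (wrapped here for readability):
    #   <span class='key'>def</span> <span class='nam'>foo</span>
    #   <span class='op'>(</span><span class='op'>)</span><span class='op'>:</span>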
diff --git a/coverage/phystokens.py b/coverage/phystokens.py
new file mode 100644
index 00000000..7eebb8ad
--- /dev/null
+++ b/coverage/phystokens.py
@@ -0,0 +1,101 @@
+"""Better tokenizing for coverage.py."""
+
+import keyword, re, token, tokenize
+from coverage.backward import StringIO # pylint: disable-msg=W0622
+
+def phys_tokens(toks):
+ """Return all physical tokens, even line continuations.
+
+ tokenize.generate_tokens() doesn't return a token for the backslash that
+ continues lines. This wrapper provides those tokens so that we can
+ re-create a faithful representation of the original source.
+
+ Returns the same values as generate_tokens()
+
+ """
+ last_line = None
+ last_lineno = -1
+ for ttype, ttext, (slineno, scol), (elineno, ecol), ltext in toks:
+ if last_lineno != elineno:
+ if last_line and last_line[-2:] == "\\\n":
+ # We are at the beginning of a new line, and the last line
+ # ended with a backslash. We probably have to inject a
+ # backslash token into the stream. Unfortunately, there's more
+ # to figure out. This code::
+ #
+ # usage = """\
+ # HEY THERE
+ # """
+ #
+ # triggers this condition, but the token text is::
+ #
+ # '"""\\\nHEY THERE\n"""'
+ #
+ # so we need to figure out if the backslash is already in the
+ # string token or not.
+ inject_backslash = True
+ if ttype == token.STRING:
+ if "\n" in ttext and ttext.split('\n', 1)[0][-1] == '\\':
+ # It's a multiline string and the first line ends with
+ # a backslash, so we don't need to inject another.
+ inject_backslash = False
+ if inject_backslash:
+ # Figure out what column the backslash is in.
+ ccol = len(last_line.split("\n")[-2]) - 1
+ # Yield the token, with a fake token type.
+ yield (
+ 99999, "\\\n",
+ (slineno, ccol), (slineno, ccol+2),
+ last_line
+ )
+ last_line = ltext
+ yield ttype, ttext, (slineno, scol), (elineno, ecol), ltext
+ last_lineno = elineno
+
+
+def source_token_lines(source):
+ """Generate a series of lines, one for each line in `source`.
+
+ Each line is a list of pairs, each pair is a token::
+
+ [('key', 'def'), ('ws', ' '), ('nam', 'hello'), ('op', '('), ... ]
+
+ Each pair has a token class, and the token text.
+
+ If you concatenate all the token texts, and then join them with newlines,
+ you should have your original `source` back, with two differences:
+ trailing whitespace is not preserved, and a final line with no newline
+ is indistinguishable from a final line with a newline.
+
+ """
+ ws_tokens = [token.INDENT, token.DEDENT, token.NEWLINE, tokenize.NL]
+ line = []
+ col = 0
+ tokgen = tokenize.generate_tokens(StringIO(source).readline)
+ for ttype, ttext, (_, scol), (_, ecol), _ in phys_tokens(tokgen):
+ mark_start = True
+ for part in re.split('(\n)', ttext):
+ if part == '\n':
+ yield line
+ line = []
+ col = 0
+ mark_end = False
+ elif part == '':
+ mark_end = False
+ elif ttype in ws_tokens:
+ mark_end = False
+ else:
+ if mark_start and scol > col:
+ line.append(("ws", " " * (scol - col)))
+ mark_start = False
+ tok_class = tokenize.tok_name.get(ttype, 'xx').lower()[:3]
+ if ttype == token.NAME and keyword.iskeyword(ttext):
+ tok_class = "key"
+ line.append((tok_class, part))
+ mark_end = True
+ scol = 0
+ if mark_end:
+ col = ecol
+
+ if line:
+ yield line
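
As a quick sanity check of phys_tokens() (a sketch, not part of this commit; io.StringIO is used here for brevity instead of coverage.backward.StringIO), a backslash-continued line produces the injected fake token that the stdlib tokenizer omits:

    import io, tokenize
    from coverage.phystokens import phys_tokens

    src = 'total = 1 + \\\n    2\n'
    toks = tokenize.generate_tokens(io.StringIO(src).readline)
    texts = [ttext for _, ttext, _, _, _ in phys_tokens(toks)]

    # The continuation token with fake type 99999 shows up as "\\\n";
    # tokenize.generate_tokens() alone never yields it.
    assert "\\\n" in texts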
diff --git a/test/stress_phystoken.txt b/test/stress_phystoken.txt
new file mode 100644
index 00000000..bd6a453a
--- /dev/null
+++ b/test/stress_phystoken.txt
@@ -0,0 +1,35 @@
+# Here's some random Python so that test_tokenize_myself will have some
+# stressful stuff to try. This file is .txt instead of .py so pylint won't
+# complain about it.
+
+first_back = """\
+hey there!
+"""
+
+other_back = """
+hey \
+there
+"""
+
+lots_of_back = """\
+hey \
+there
+"""
+fake_back = """\
+ouch
+"""
+
+class C(object):
+ def there():
+ this = 5 + \
+ 7
+ that = \
+ "a continued line"
+
+cont1 = "one line of text" + \
+ "another line of text"
+
+def hello():
+ print("Hello world!")
+
+hello()
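
The fake_back case above is exactly the situation the inject_backslash check in phys_tokens() guards against: the backslash already lives inside the STRING token, so no extra continuation token should be added. A small check with the stdlib tokenizer (again only a sketch, not part of this commit):

    import io, token, tokenize

    src = 'fake_back = """\\\nouch\n"""\n'
    toks = tokenize.generate_tokens(io.StringIO(src).readline)
    string_text = next(ttext for ttype, ttext, _, _, _ in toks
                       if ttype == token.STRING)

    print(repr(string_text))    # '"""\\\nouch\n"""'
    # The first physical line of the token text ends with the backslash,
    # which is the condition phys_tokens() tests before deciding not to inject.
    assert string_text.split('\n', 1)[0][-1] == '\\'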
diff --git a/test/test_phystokens.py b/test/test_phystokens.py
new file mode 100644
index 00000000..03f2a929
--- /dev/null
+++ b/test/test_phystokens.py
@@ -0,0 +1,58 @@
+"""Tests for Coverage.py's improved tokenizer."""
+
+import os, re, sys
+
+sys.path.insert(0, os.path.split(__file__)[0]) # Force relative import for Py3k
+from coveragetest import CoverageTest
+
+from coverage.phystokens import source_token_lines
+
+
+SIMPLE = """\
+# yay!
+def foo():
+ say('two = %d' % 2)
+"""
+
+HERE = os.path.split(__file__)[0]
+
+class PhysTokensTest(CoverageTest):
+ """Tests for Coverage.py's improver tokenizer."""
+
+ def check_tokenization(self, source):
+ """Tokenize `source`, then put it back together, should be the same."""
+ tokenized = ""
+ for line in source_token_lines(source):
+ text = "".join([t for _,t in line])
+ tokenized += text + "\n"
+ source = re.sub("(?m)[ \t]+$", "", source)
+ tokenized = re.sub("(?m)[ \t]+$", "", tokenized)
+ #if source != tokenized:
+ # open(r"c:\foo\0.py", "w").write(source)
+ # open(r"c:\foo\1.py", "w").write(tokenized)
+ self.assertEqual(source, tokenized)
+
+ def check_file_tokenization(self, fname):
+ """Use the contents of `fname` for `check_tokenization`."""
+ self.check_tokenization(open(fname).read())
+
+ def test_simple(self):
+ self.assertEqual(list(source_token_lines(SIMPLE)),
+ [
+ [('com', "# yay!")],
+ [('key', 'def'), ('ws', ' '), ('nam', 'foo'), ('op', '('),
+ ('op', ')'), ('op', ':')],
+ [('ws', ' '), ('nam', 'say'), ('op', '('),
+ ('str', "'two = %d'"), ('ws', ' '), ('op', '%'),
+ ('ws', ' '), ('num', '2'), ('op', ')')]
+ ]
+ )
+ self.check_tokenization(SIMPLE)
+
+ def test_tokenize_real_file(self):
+ real_file = os.path.join(HERE, "test_coverage.py")
+ self.check_file_tokenization(real_file)
+
+ def test_stress(self):
+ stress = os.path.join(HERE, "stress_phystoken.txt")
+ self.check_file_tokenization(stress)
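
The round-trip property that check_tokenization() asserts can also be seen in isolation. This is only a sketch of the behavior documented in source_token_lines(): concatenating the token texts and joining the lines with newlines reproduces the source, modulo trailing whitespace and the final newline:

    from coverage.phystokens import source_token_lines

    src = "def hello():\n    print('Hello world!')\n"
    lines = list(source_token_lines(src))

    rebuilt = "\n".join("".join(text for _, text in line) for line in lines) + "\n"
    assert rebuilt == src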