doc/check_copied_from.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113

# Licensed under the Apache License: http://www.apache.org/licenses/LICENSE-2.0
# For details: https://github.com/nedbat/coveragepy/blob/master/NOTICE.txt

"""Find lines in files that should be faithful copies, and check that they are.

Inside a comment-marked section, any chunk of indented lines should be
faithfully copied from FILENAME.  The indented lines are dedented before
comparing.

The section is between these comments:

    .. copied_from <FILENAME>

    .. end_copied_from

This tool will print any mismatches, and then exit with a count of mismatches.

"""

import glob
from itertools import groupby
from operator import itemgetter
import re
import sys
import textwrap


def check_copied_from(rst_name):
    """Check copies in a .rst file.

    Prints problems.  Returns count of bad copies.
    """
    bad_copies = 0
    file_read = None
    file_text = None
    with open(rst_name) as frst:
        for filename, first_line, text in find_copied_chunks(frst):
            if filename != file_read:
                with open(filename) as f:
                    file_text = f.read()
                file_read = filename
            if text not in file_text:
                print("{}:{}: Bad copy from {}, starting with {!r}".format(
                    rst_name, first_line, filename, text.splitlines()[0]
                ))
                bad_copies += 1

    return bad_copies


def find_copied_chunks(frst):
    """Find chunks of text that are meant to be faithful copies.

    `frst` is an iterable of strings, the .rst text.

    Yields (source_filename, first_line, text) tuples.
    """
    for (_, filename), chunks in groupby(find_copied_lines(frst), itemgetter(0)):
        chunks = list(chunks)
        first_line = chunks[0][1]
        text = textwrap.dedent("\n".join(map(itemgetter(2), chunks)))
        yield filename, first_line, text


def find_copied_lines(frst):
    """Find lines of text that are meant to be faithful copies.

    `frst` is an iterable of strings, the .rst text.

    Yields tuples ((chunk_num, file_name), line_num, line).

    `chunk_num` is an integer that is different for each distinct (blank
    line separated) chunk of text, but has no meaning other than that.

    `file_name` is the file the chunk should be copied from.  `line_num`
    is the line number in the .rst file, and `line` is the text of the line.

    """
    in_section = False
    source_file = None
    chunk_num = 0

    for line_num, line in enumerate(frst, start=1):
        line = line.rstrip()
        if in_section:
            m = re.search(r"^.. end_copied_from", line)
            if m:
                in_section = False
            else:
                if re.search(r"^\s+\S", line):
                    # Indented line
                    yield (chunk_num, source_file), line_num, line
                elif not line.strip():
                    # Blank line
                    chunk_num += 1
        else:
            m = re.search(r"^.. copied_from: (.*)", line)
            if m:
                in_section = True
                source_file = m.group(1)


def main(args):
    """Check all the files in `args`, return count of bad copies."""
    bad_copies = 0
    for arg in args:
        for fname in glob.glob(arg):
            bad_copies += check_copied_from(fname)
    return bad_copies


if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))