1 files changed, 372 insertions, 0 deletions
diff --git a/pylint/checkers/similar.py b/pylint/checkers/similar.py
new file mode 100644
index 0000000..9542077
--- /dev/null
+++ b/pylint/checkers/similar.py
@@ -0,0 +1,372 @@
+# pylint: disable=W0622
+# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE).
+# http://www.logilab.fr/ -- mailto:contact@logilab.fr
+#
+# This program is free software; you can redistribute it and/or modify it under
+# the terms of the GNU General Public License as published by the Free Software
+# Foundation; either version 2 of the License, or (at your option) any later
+# version.
+#
+# This program is distributed in the hope that it will be useful, but WITHOUT
+# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
+# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details
+#
+# You should have received a copy of the GNU General Public License along with
+# this program; if not, write to the Free Software Foundation, Inc.,
+# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
+"""a similarities / code duplication command line tool and pylint checker
+"""
+from __future__ import print_function
+import sys
+from collections import defaultdict
+
+from logilab.common.ureports import Table
+
+from pylint.interfaces import IRawChecker
+from pylint.checkers import BaseChecker, table_lines_from_stats
+
+import six
+from six.moves import zip
+
+
+class Similar(object):
+    """Find copy-pasted lines of code among the files appended to it."""
+
+    def __init__(self, min_lines=4, ignore_comments=False,
+                 ignore_docstrings=False, ignore_imports=False):
+        self.min_lines = min_lines
+        self.ignore_comments = ignore_comments
+        self.ignore_docstrings = ignore_docstrings
+        self.ignore_imports = ignore_imports
+        self.linesets = []  # one LineSet per successfully appended stream
+
+    def append_stream(self, streamid, stream, encoding=None):
+        """append a file to search in; a file that fails to decode is skipped"""
+        if encoding is None:
+            readlines = stream.readlines
+        else:
+            readlines = lambda: [line.decode(encoding) for line in stream]
+        try:
+            self.linesets.append(LineSet(streamid,
+                                         readlines(),
+                                         self.ignore_comments,
+                                         self.ignore_docstrings,
+                                         self.ignore_imports))
+        except UnicodeDecodeError:
+            pass  # deliberate best effort: leave undecodable files out
+
+    def run(self):
+        """compute the similarities, then print them and a summary on stdout"""
+        self._display_sims(self._compute_sims())
+
+    def _compute_sims(self):
+        """return (num, {(lineset, index)}) similarities, biggest first"""
+        no_duplicates = defaultdict(list)
+        for num, lineset1, idx1, lineset2, idx2 in self._iter_sims():
+            duplicate = no_duplicates[num]
+            for couples in duplicate:
+                if (lineset1, idx1) in couples or (lineset2, idx2) in couples:
+                    couples.add((lineset1, idx1))  # grow the existing group
+                    couples.add((lineset2, idx2))
+                    break
+            else:
+                duplicate.append(set([(lineset1, idx1), (lineset2, idx2)]))
+        sims = []
+        for num, ensembles in six.iteritems(no_duplicates):
+            for couples in ensembles:
+                sims.append((num, couples))
+        sims.sort()
+        sims.reverse()
+        return sims
+
+    def _display_sims(self, sims):
+        """print each similarity, then a TOTAL summary line, on stdout"""
+        duplicated_lines = 0
+        for num, couples in sims:
+            print()
+            print(num, "similar lines in", len(couples), "files")
+            couples = sorted(couples)
+            for lineset, idx in couples:
+                print("==%s:%s" % (lineset.name, idx))
+            # pylint: disable=W0631
+            for line in lineset._real_lines[idx:idx+num]:
+                print("  ", line.rstrip())
+            duplicated_lines += num * (len(couples)-1)
+        total_lines = sum([len(lineset) for lineset in self.linesets])
+        print("TOTAL lines=%s duplicates=%s percent=%.2f" \
+            % (total_lines, duplicated_lines,
+               total_lines and duplicated_lines*100. / total_lines))
+
+    def _find_common(self, lineset1, lineset2):
+        """yield (num, lineset1, idx1, lineset2, idx2) for each common chunk"""
+        lines1 = lineset1.enumerate_stripped
+        lines2 = lineset2.enumerate_stripped
+        find = lineset2.find
+        index1 = 0
+        min_lines = self.min_lines
+        while index1 < len(lineset1):
+            skip = 1
+            num = 0
+            for index2 in find(lineset1[index1]):
+                non_blank = 0
+                for num, ((_, line1), (_, line2)) in enumerate(
+                        zip(lines1(index1), lines2(index2))):
+                    if line1 != line2:
+                        if non_blank > min_lines:  # strictly more than min_lines
+                            yield num, lineset1, index1, lineset2, index2
+                        skip = max(skip, num)
+                        break
+                    if line1:
+                        non_blank += 1
+                else:
+                    # we may have reached the end
+                    num += 1
+                    if non_blank > min_lines:
+                        yield num, lineset1, index1, lineset2, index2
+                    skip = max(skip, num)
+            index1 += skip
+
+    def _iter_sims(self):
+        """iterate on similarities among all files, comparing each unordered
+        pair of linesets exactly once
+        """
+        for idx, lineset in enumerate(self.linesets[:-1]):
+            for lineset2 in self.linesets[idx+1:]:
+                for sim in self._find_common(lineset, lineset2):
+                    yield sim
+
+def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports):
+    """Return a list with one entry per input line: the line stripped of
+    surrounding whitespace, or '' / a shortened line where an ignored
+    feature (docstring, import, trailing comment) was dropped.
+    """
+    result = []
+    open_quote = None  # the triple quote of the docstring being skipped
+    for raw in lines:
+        text = raw.strip()
+        if ignore_docstrings:
+            if not open_quote and text.startswith(('"""', "'''")):
+                open_quote = text[:3]
+                text = text[3:]
+            if open_quote:
+                # the whole docstring line is dropped; a line ending with
+                # the opening quote style closes the docstring
+                if text.endswith(open_quote):
+                    open_quote = None
+                text = ''
+        if ignore_imports and text.startswith(("import ", "from ")):
+            text = ''
+        if ignore_comments:
+            # XXX should use regex in checkers/format to avoid cutting
+            # at a "#" in a string
+            text = text.partition('#')[0].strip()
+        result.append(text)
+    return result
+
+
+class LineSet(object):
+    """Raw and stripped lines of one source file, indexed by stripped text"""
+    def __init__(self, name, lines, ignore_comments=False,
+                 ignore_docstrings=False, ignore_imports=False):
+        self.name = name
+        self._real_lines = lines
+        cleaned = stripped_lines(lines, ignore_comments, ignore_docstrings,
+                                 ignore_imports)
+        self._stripped_lines = cleaned  # what the comparison works on
+        self._index = self._mk_index()
+
+    def __str__(self):
+        return '<Lineset for %s>' % self.name
+
+    def __len__(self):
+        return len(self._real_lines)
+
+    def __getitem__(self, index):
+        return self._stripped_lines[index]
+
+    def __lt__(self, other):
+        return self.name < other.name
+
+    def __hash__(self):
+        return id(self)
+
+    def enumerate_stripped(self, start_at=0):
+        """Yield (index, stripped line) pairs from `start_at` (default 0) to
+        the end of the file; blank lines are yielded too, so every stripped
+        line keeps its own position as index.
+        """
+        if start_at:
+            tail = self._stripped_lines[start_at:]
+        else:
+            # nothing to skip: iterate the list itself, without copying it
+            tail = self._stripped_lines
+        # blank lines are deliberately not filtered out here
+        for idx, text in enumerate(tail, start_at):
+            yield idx, text
+
+    def find(self, stripped_line):
+        """Return the indexed positions of `stripped_line`, or () if absent."""
+        return self._index.get(stripped_line, ())
+
+    def _mk_index(self):
+        """Map every non-blank stripped line to the list of its positions."""
+        positions = defaultdict(list)
+        for position, text in enumerate(self._stripped_lines):
+            if text:
+                positions[text].append(position)
+        return positions
+
+
+# messages of the checker, keyed by message id: (template, symbol, help text)
+MSGS = {'R0801': ('Similar lines in %s files\n%s', 'duplicate-code',
+                  'Indicates that a set of similar lines has been detected '
+                  'among multiple files. This usually means that the code should '
+                  'be refactored to avoid this duplication.')}
+
+def report_similarities(sect, stats, old_stats):
+    """Append to `sect` a 4-column table comparing current and old stats."""
+    stat_names = ('nb_duplicated_lines', 'percent_duplicated_lines')
+    # header cells, followed by whatever table_lines_from_stats returns
+    cells = ['', 'now', 'previous', 'difference']
+    cells.extend(table_lines_from_stats(stats, old_stats, stat_names))
+    sect.append(Table(children=cells, cols=4, rheaders=1, cheaders=1))
+
+
+# wrapper to get a pylint checker from the similar class
+class SimilarChecker(BaseChecker, Similar):
+    """checks for similarities and duplicated code. This computation may be
+    memory / CPU intensive, so you should disable it if you experience
+    problems.
+    """
+
+    __implements__ = (IRawChecker,)
+    # configuration section name
+    name = 'similarities'
+    # messages this checker can emit
+    msgs = MSGS
+    # configuration options, each mirrored onto a Similar attribute by set_option
+    # for available dict keys/values see the optik parser 'add_option' method
+    options = (('min-similarity-lines',
+                {'default' : 4, 'type' : "int", 'metavar' : '<int>',
+                 'help' : 'Minimum lines number of a similarity.'}),
+               ('ignore-comments',
+                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
+                 'help': 'Ignore comments when computing similarities.'}
+               ),
+               ('ignore-docstrings',
+                {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>',
+                 'help': 'Ignore docstrings when computing similarities.'}
+               ),
+               ('ignore-imports',
+                {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>',
+                 'help': 'Ignore imports when computing similarities.'}
+               ),
+              )
+    # reports: (report id, title, function filling the report section)
+    reports = (('RP0801', 'Duplication', report_similarities),)
+
+    def __init__(self, linter=None):
+        BaseChecker.__init__(self, linter)
+        Similar.__init__(self, min_lines=4,  # same values as the option defaults
+                         ignore_comments=True, ignore_docstrings=True)
+        self.stats = None  # assigned in open()
+
+    def set_option(self, optname, value, action=None, optdict=None):
+        """method called to set an option (registered in the options list)
+
+        overridden to mirror the new value onto the matching Similar attribute
+        """
+        BaseChecker.set_option(self, optname, value, action, optdict)
+        if optname == 'min-similarity-lines':
+            self.min_lines = self.config.min_similarity_lines
+        elif optname == 'ignore-comments':
+            self.ignore_comments = self.config.ignore_comments
+        elif optname == 'ignore-docstrings':
+            self.ignore_docstrings = self.config.ignore_docstrings
+        elif optname == 'ignore-imports':
+            self.ignore_imports = self.config.ignore_imports
+
+    def open(self):
+        """init the checker: reset linesets and statistics information"""
+        self.linesets = []
+        self.stats = self.linter.add_stats(nb_duplicated_lines=0,
+                                           percent_duplicated_lines=0)
+
+    def process_module(self, node):
+        """process a module
+
+        the module's content is read from the stream returned by
+        node.stream() and appended under the linter's current module name;
+        node.file_encoding, when not None, is used to decode each line
+        """
+        with node.stream() as stream:
+            self.append_stream(self.linter.current_name,
+                               stream,
+                               node.file_encoding)
+
+    def close(self):
+        """compute similarities on closing, emit R0801 per group, fill stats"""
+        total = sum([len(lineset) for lineset in self.linesets])
+        duplicated = 0
+        stats = self.stats
+        for num, couples in self._compute_sims():
+            msg = []
+            for lineset, idx in couples:
+                msg.append("==%s:%s" % (lineset.name, idx))
+            msg.sort()  # sorted location headers; code lines are appended after
+            # pylint: disable=W0631
+            for line in lineset._real_lines[idx:idx+num]:
+                msg.append(line.rstrip())
+            self.add_message('R0801', args=(len(couples), '\n'.join(msg)))
+            duplicated += num * (len(couples) - 1)
+        stats['nb_duplicated_lines'] = duplicated
+        stats['percent_duplicated_lines'] = total and duplicated * 100. / total
+
+
+def register(linter):
+    """required method to auto register this checker: adds a SimilarChecker"""
+    linter.register_checker(SimilarChecker(linter))
+
+def usage(status=0):
+    """Print the command line synopsis to stdout, then exit with `status`."""
+    synopsis = ('Usage: symilar [-d|--duplicates min_duplicated_lines] '
+                '[-i|--ignore-comments] [--ignore-docstrings] '
+                '[--ignore-imports] file1...')
+    print("finds copy pasted blocks in a set of files\n\n" + synopsis)
+    sys.exit(status)
+
+def Run(argv=None):
+    """standalone command line access point; always ends through sys.exit"""
+    if argv is None:
+        argv = sys.argv[1:]
+    from getopt import getopt
+    s_opts = 'hd:i'  # -d takes a value, exactly like --duplicates=
+    l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports',
+              'ignore-docstrings')
+    min_lines = 4
+    ignore_comments = False
+    ignore_docstrings = False
+    ignore_imports = False
+    opts, args = getopt(argv, s_opts, l_opts)
+    for opt, val in opts:
+        if opt in ('-d', '--duplicates'):
+            min_lines = int(val)
+        elif opt in ('-h', '--help'):
+            usage()
+        elif opt in ('-i', '--ignore-comments'):
+            ignore_comments = True
+        elif opt in ('--ignore-docstrings',):
+            ignore_docstrings = True
+        elif opt in ('--ignore-imports',):
+            ignore_imports = True
+    if not args:
+        usage(1)  # at least one file is required; usage() exits
+    sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports)
+    for filename in args:
+        with open(filename) as stream:
+            sim.append_stream(filename, stream)
+    sim.run()
+    sys.exit(0)
+
+if __name__ == '__main__':
+    Run()  # no argv given: Run() falls back to sys.argv[1:]