diff options
Diffstat (limited to 'pylint/checkers/similar.py')
-rw-r--r-- | pylint/checkers/similar.py | 372 |
1 files changed, 372 insertions, 0 deletions
diff --git a/pylint/checkers/similar.py b/pylint/checkers/similar.py new file mode 100644 index 0000000..9542077 --- /dev/null +++ b/pylint/checkers/similar.py @@ -0,0 +1,372 @@ +# pylint: disable=W0622 +# Copyright (c) 2004-2013 LOGILAB S.A. (Paris, FRANCE). +# http://www.logilab.fr/ -- mailto:contact@logilab.fr +# +# This program is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License as published by the Free Software +# Foundation; either version 2 of the License, or (at your option) any later +# version. +# +# This program is distributed in the hope that it will be useful, but WITHOUT +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS +# FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details +# +# You should have received a copy of the GNU General Public License along with +# this program; if not, write to the Free Software Foundation, Inc., +# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. +"""a similarities / code duplication command line tool and pylint checker +""" +from __future__ import print_function +import sys +from collections import defaultdict + +from logilab.common.ureports import Table + +from pylint.interfaces import IRawChecker +from pylint.checkers import BaseChecker, table_lines_from_stats + +import six +from six.moves import zip + + +class Similar(object): + """finds copy-pasted lines of code in a project""" + + def __init__(self, min_lines=4, ignore_comments=False, + ignore_docstrings=False, ignore_imports=False): + self.min_lines = min_lines + self.ignore_comments = ignore_comments + self.ignore_docstrings = ignore_docstrings + self.ignore_imports = ignore_imports + self.linesets = [] + + def append_stream(self, streamid, stream, encoding=None): + """append a file to search for similarities""" + if encoding is None: + readlines = stream.readlines + else: + readlines = lambda: [line.decode(encoding) for line in stream] + try: + self.linesets.append(LineSet(streamid, + readlines(), + self.ignore_comments, + self.ignore_docstrings, + self.ignore_imports)) + except UnicodeDecodeError: + pass + + def run(self): + """start looking for similarities and display results on stdout""" + self._display_sims(self._compute_sims()) + + def _compute_sims(self): + """compute similarities in appended files""" + no_duplicates = defaultdict(list) + for num, lineset1, idx1, lineset2, idx2 in self._iter_sims(): + duplicate = no_duplicates[num] + for couples in duplicate: + if (lineset1, idx1) in couples or (lineset2, idx2) in couples: + couples.add((lineset1, idx1)) + couples.add((lineset2, idx2)) + break + else: + duplicate.append(set([(lineset1, idx1), (lineset2, idx2)])) + sims = [] + for num, ensembles in six.iteritems(no_duplicates): + for couples in ensembles: + sims.append((num, couples)) + sims.sort() + sims.reverse() + return sims + + def _display_sims(self, sims): + """display computed similarities on stdout""" + nb_lignes_dupliquees = 0 + for num, couples in sims: + print() + print(num, "similar lines in", len(couples), "files") + couples = sorted(couples) + for lineset, idx in couples: + print("==%s:%s" % (lineset.name, idx)) + # pylint: disable=W0631 + for line in lineset._real_lines[idx:idx+num]: + print(" ", line.rstrip()) + nb_lignes_dupliquees += num * (len(couples)-1) + nb_total_lignes = sum([len(lineset) for lineset in self.linesets]) + print("TOTAL lines=%s duplicates=%s percent=%.2f" \ + % (nb_total_lignes, nb_lignes_dupliquees, + nb_lignes_dupliquees*100. / nb_total_lignes)) + + def _find_common(self, lineset1, lineset2): + """find similarities in the two given linesets""" + lines1 = lineset1.enumerate_stripped + lines2 = lineset2.enumerate_stripped + find = lineset2.find + index1 = 0 + min_lines = self.min_lines + while index1 < len(lineset1): + skip = 1 + num = 0 + for index2 in find(lineset1[index1]): + non_blank = 0 + for num, ((_, line1), (_, line2)) in enumerate( + zip(lines1(index1), lines2(index2))): + if line1 != line2: + if non_blank > min_lines: + yield num, lineset1, index1, lineset2, index2 + skip = max(skip, num) + break + if line1: + non_blank += 1 + else: + # we may have reach the end + num += 1 + if non_blank > min_lines: + yield num, lineset1, index1, lineset2, index2 + skip = max(skip, num) + index1 += skip + + def _iter_sims(self): + """iterate on similarities among all files, by making a cartesian + product + """ + for idx, lineset in enumerate(self.linesets[:-1]): + for lineset2 in self.linesets[idx+1:]: + for sim in self._find_common(lineset, lineset2): + yield sim + +def stripped_lines(lines, ignore_comments, ignore_docstrings, ignore_imports): + """return lines with leading/trailing whitespace and any ignored code + features removed + """ + + strippedlines = [] + docstring = None + for line in lines: + line = line.strip() + if ignore_docstrings: + if not docstring and \ + (line.startswith('"""') or line.startswith("'''")): + docstring = line[:3] + line = line[3:] + if docstring: + if line.endswith(docstring): + docstring = None + line = '' + if ignore_imports: + if line.startswith("import ") or line.startswith("from "): + line = '' + if ignore_comments: + # XXX should use regex in checkers/format to avoid cutting + # at a "#" in a string + line = line.split('#', 1)[0].strip() + strippedlines.append(line) + return strippedlines + + +class LineSet(object): + """Holds and indexes all the lines of a single source file""" + def __init__(self, name, lines, ignore_comments=False, + ignore_docstrings=False, ignore_imports=False): + self.name = name + self._real_lines = lines + self._stripped_lines = stripped_lines(lines, ignore_comments, + ignore_docstrings, + ignore_imports) + self._index = self._mk_index() + + def __str__(self): + return '<Lineset for %s>' % self.name + + def __len__(self): + return len(self._real_lines) + + def __getitem__(self, index): + return self._stripped_lines[index] + + def __lt__(self, other): + return self.name < other.name + + def __hash__(self): + return id(self) + + def enumerate_stripped(self, start_at=0): + """return an iterator on stripped lines, starting from a given index + if specified, else 0 + """ + idx = start_at + if start_at: + lines = self._stripped_lines[start_at:] + else: + lines = self._stripped_lines + for line in lines: + #if line: + yield idx, line + idx += 1 + + def find(self, stripped_line): + """return positions of the given stripped line in this set""" + return self._index.get(stripped_line, ()) + + def _mk_index(self): + """create the index for this set""" + index = defaultdict(list) + for line_no, line in enumerate(self._stripped_lines): + if line: + index[line].append(line_no) + return index + + +MSGS = {'R0801': ('Similar lines in %s files\n%s', + 'duplicate-code', + 'Indicates that a set of similar lines has been detected \ + among multiple file. This usually means that the code should \ + be refactored to avoid this duplication.')} + +def report_similarities(sect, stats, old_stats): + """make a layout with some stats about duplication""" + lines = ['', 'now', 'previous', 'difference'] + lines += table_lines_from_stats(stats, old_stats, + ('nb_duplicated_lines', + 'percent_duplicated_lines')) + sect.append(Table(children=lines, cols=4, rheaders=1, cheaders=1)) + + +# wrapper to get a pylint checker from the similar class +class SimilarChecker(BaseChecker, Similar): + """checks for similarities and duplicated code. This computation may be + memory / CPU intensive, so you should disable it if you experiment some + problems. + """ + + __implements__ = (IRawChecker,) + # configuration section name + name = 'similarities' + # messages + msgs = MSGS + # configuration options + # for available dict keys/values see the optik parser 'add_option' method + options = (('min-similarity-lines', + {'default' : 4, 'type' : "int", 'metavar' : '<int>', + 'help' : 'Minimum lines number of a similarity.'}), + ('ignore-comments', + {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', + 'help': 'Ignore comments when computing similarities.'} + ), + ('ignore-docstrings', + {'default' : True, 'type' : 'yn', 'metavar' : '<y or n>', + 'help': 'Ignore docstrings when computing similarities.'} + ), + ('ignore-imports', + {'default' : False, 'type' : 'yn', 'metavar' : '<y or n>', + 'help': 'Ignore imports when computing similarities.'} + ), + ) + # reports + reports = (('RP0801', 'Duplication', report_similarities),) + + def __init__(self, linter=None): + BaseChecker.__init__(self, linter) + Similar.__init__(self, min_lines=4, + ignore_comments=True, ignore_docstrings=True) + self.stats = None + + def set_option(self, optname, value, action=None, optdict=None): + """method called to set an option (registered in the options list) + + overridden to report options setting to Similar + """ + BaseChecker.set_option(self, optname, value, action, optdict) + if optname == 'min-similarity-lines': + self.min_lines = self.config.min_similarity_lines + elif optname == 'ignore-comments': + self.ignore_comments = self.config.ignore_comments + elif optname == 'ignore-docstrings': + self.ignore_docstrings = self.config.ignore_docstrings + elif optname == 'ignore-imports': + self.ignore_imports = self.config.ignore_imports + + def open(self): + """init the checkers: reset linesets and statistics information""" + self.linesets = [] + self.stats = self.linter.add_stats(nb_duplicated_lines=0, + percent_duplicated_lines=0) + + def process_module(self, node): + """process a module + + the module's content is accessible via the stream object + + stream must implement the readlines method + """ + with node.stream() as stream: + self.append_stream(self.linter.current_name, + stream, + node.file_encoding) + + def close(self): + """compute and display similarities on closing (i.e. end of parsing)""" + total = sum([len(lineset) for lineset in self.linesets]) + duplicated = 0 + stats = self.stats + for num, couples in self._compute_sims(): + msg = [] + for lineset, idx in couples: + msg.append("==%s:%s" % (lineset.name, idx)) + msg.sort() + # pylint: disable=W0631 + for line in lineset._real_lines[idx:idx+num]: + msg.append(line.rstrip()) + self.add_message('R0801', args=(len(couples), '\n'.join(msg))) + duplicated += num * (len(couples) - 1) + stats['nb_duplicated_lines'] = duplicated + stats['percent_duplicated_lines'] = total and duplicated * 100. / total + + +def register(linter): + """required method to auto register this checker """ + linter.register_checker(SimilarChecker(linter)) + +def usage(status=0): + """display command line usage information""" + print("finds copy pasted blocks in a set of files") + print() + print('Usage: symilar [-d|--duplicates min_duplicated_lines] \ +[-i|--ignore-comments] [--ignore-docstrings] [--ignore-imports] file1...') + sys.exit(status) + +def Run(argv=None): + """standalone command line access point""" + if argv is None: + argv = sys.argv[1:] + from getopt import getopt + s_opts = 'hdi' + l_opts = ('help', 'duplicates=', 'ignore-comments', 'ignore-imports', + 'ignore-docstrings') + min_lines = 4 + ignore_comments = False + ignore_docstrings = False + ignore_imports = False + opts, args = getopt(argv, s_opts, l_opts) + for opt, val in opts: + if opt in ('-d', '--duplicates'): + min_lines = int(val) + elif opt in ('-h', '--help'): + usage() + elif opt in ('-i', '--ignore-comments'): + ignore_comments = True + elif opt in ('--ignore-docstrings',): + ignore_docstrings = True + elif opt in ('--ignore-imports',): + ignore_imports = True + if not args: + usage(1) + sim = Similar(min_lines, ignore_comments, ignore_docstrings, ignore_imports) + for filename in args: + with open(filename) as stream: + sim.append_stream(filename, stream) + sim.run() + sys.exit(0) + +if __name__ == '__main__': + Run() |