From 019794b808271d45f86a7014e9c91cb04458a47a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dani=C3=ABl=20van=20Noord?= <13665637+DanielNoord@users.noreply.github.com> Date: Thu, 30 Dec 2021 15:14:20 +0100 Subject: Use ``dill`` to pickle when run in parallel mode (#5609) --- ChangeLog | 3 +++ doc/whatsnew/2.13.rst | 3 +++ pylint/lint/parallel.py | 49 ++++++++++++++++++++++++++++++-------------- setup.cfg | 4 ++++ tests/test_check_parallel.py | 25 ++++++++++++++++++---- 5 files changed, 65 insertions(+), 19 deletions(-) diff --git a/ChangeLog b/ChangeLog index 87cf8b765..19a3b73ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -11,6 +11,9 @@ Release date: TBA .. Put new features here and also in 'doc/whatsnew/2.13.rst' +* When run in parallel mode ``pylint`` now pickles the data passed to subprocesses with + the ``dill`` package. The ``dill`` package has therefore been added as a dependency. + * ``used-before-assignment`` now considers that assignments in a try block may not have occurred when the except or finally blocks are executed. diff --git a/doc/whatsnew/2.13.rst b/doc/whatsnew/2.13.rst index bf38d767e..9510cd54a 100644 --- a/doc/whatsnew/2.13.rst +++ b/doc/whatsnew/2.13.rst @@ -32,6 +32,9 @@ Extensions Other Changes ============= +* When run in parallel mode ``pylint`` now pickles the data passed to subprocesses with + the ``dill`` package. The ``dill`` package has therefore been added as a dependency. + * By default, pylint does no longer take files starting with ``.#`` into account. Those are considered `emacs file locks`_. This behavior can be reverted by redefining the ``ignore-patterns`` option. diff --git a/pylint/lint/parallel.py b/pylint/lint/parallel.py index eaacb710d..6dbeed983 100644 --- a/pylint/lint/parallel.py +++ b/pylint/lint/parallel.py @@ -3,7 +3,18 @@ import collections import functools -from typing import Any, DefaultDict, Iterable, List, Tuple +from typing import ( + TYPE_CHECKING, + Any, + DefaultDict, + Iterable, + List, + Sequence, + Tuple, + Union, +) + +import dill from pylint import reporters from pylint.lint.utils import _patch_sys_path @@ -16,6 +27,9 @@ try: except ImportError: multiprocessing = None # type: ignore[assignment] +if TYPE_CHECKING: + from pylint.lint import PyLinter + # PyLinter object used by worker processes when checking files using multiprocessing # should only be used by the worker processes _worker_linter = None @@ -33,9 +47,16 @@ def _get_new_args(message): return (message.msg_id, message.symbol, location, message.msg, message.confidence) -def _worker_initialize(linter, arguments=None): +def _worker_initialize( + linter: bytes, arguments: Union[None, str, Sequence[str]] = None +) -> None: + """Function called to initialize a worker for a Process within a multiprocessing Pool + + :param linter: A linter-class (PyLinter) instance pickled with dill + :param arguments: File or module name(s) to lint and to be added to sys.path + """ global _worker_linter # pylint: disable=global-statement - _worker_linter = linter + _worker_linter = dill.loads(linter) # On the worker process side the messages are just collected and passed back to # parent process as _worker_check_file function's return value @@ -97,26 +118,24 @@ def _merge_mapreduce_data(linter, all_mapreduce_data): checker.reduce_map_data(linter, collated_map_reduce_data[checker.name]) -def check_parallel(linter, jobs, files: Iterable[FileItem], arguments=None): +def check_parallel( + linter: "PyLinter", + jobs: int, + files: Iterable[FileItem], + arguments: Union[None, str, Sequence[str]] = None, +) -> None: """Use the given linter to lint the files with given amount of workers (jobs) This splits the work filestream-by-filestream. If you need to do work across multiple files, as in the similarity-checker, then inherit from MapReduceMixin and - implement the map/reduce mixin functionality""" - # The reporter does not need to be passed to worker processes, i.e. the reporter does - original_reporter = linter.reporter - linter.reporter = None - + implement the map/reduce mixin functionality. + """ # The linter is inherited by all the pool's workers, i.e. the linter # is identical to the linter object here. This is required so that # a custom PyLinter object can be used. initializer = functools.partial(_worker_initialize, arguments=arguments) pool = multiprocessing.Pool( # pylint: disable=consider-using-with - jobs, initializer=initializer, initargs=[linter] + jobs, initializer=initializer, initargs=[dill.dumps(linter)] ) - # ...and now when the workers have inherited the linter, the actual reporter - # can be set back here on the parent process so that results get stored into - # correct reporter - linter.set_reporter(original_reporter) linter.open() try: all_stats = [] @@ -141,7 +160,7 @@ def check_parallel(linter, jobs, files: Iterable[FileItem], arguments=None): msg = Message( msg[0], msg[1], MessageLocationTuple(*msg[2]), msg[3], msg[4] ) - linter.reporter.handle_message(msg) # type: ignore[attr-defined] # linter.set_reporter() call above makes linter have a reporter attr + linter.reporter.handle_message(msg) all_stats.append(stats) all_mapreduce_data[worker_idx].append(mapreduce_data) linter.msg_status |= msg_status diff --git a/setup.cfg b/setup.cfg index 2a0bb250a..8b438e645 100644 --- a/setup.cfg +++ b/setup.cfg @@ -43,6 +43,7 @@ project_urls = [options] packages = find: install_requires = + dill>=0.2 platformdirs>=2.2.0 astroid>=2.9.0,<2.10 # (You should also upgrade requirements_test_min.txt) isort>=4.2.5,<6 @@ -123,3 +124,6 @@ ignore_missing_imports = True [mypy-git.*] ignore_missing_imports = True + +[mypy-dill] +ignore_missing_imports = True diff --git a/tests/test_check_parallel.py b/tests/test_check_parallel.py index 01fc352dc..4fa49d148 100644 --- a/tests/test_check_parallel.py +++ b/tests/test_check_parallel.py @@ -11,9 +11,12 @@ # pylint: disable=protected-access,missing-function-docstring,no-self-use +import argparse +import multiprocessing import os from typing import List +import dill import pytest from astroid import nodes @@ -174,8 +177,22 @@ class TestCheckParallelFramework: def test_worker_initialize(self) -> None: linter = PyLinter(reporter=Reporter()) - worker_initialize(linter=linter) - assert pylint.lint.parallel._worker_linter == linter + worker_initialize(linter=dill.dumps(linter)) + assert isinstance(pylint.lint.parallel._worker_linter, type(linter)) + + def test_worker_initialize_pickling(self) -> None: + """Test that we can pickle objects that standard pickling in multiprocessing can't. + + See: + https://stackoverflow.com/questions/8804830/python-multiprocessing-picklingerror-cant-pickle-type-function + https://github.com/PyCQA/pylint/pull/5584 + """ + linter = PyLinter(reporter=Reporter()) + linter.attribute = argparse.ArgumentParser() # type: ignore[attr-defined] + pool = multiprocessing.Pool( # pylint: disable=consider-using-with + 2, initializer=worker_initialize, initargs=[dill.dumps(linter)] + ) + pool.imap_unordered(print, [1, 2]) def test_worker_check_single_file_uninitialised(self) -> None: pylint.lint.parallel._worker_linter = None @@ -186,7 +203,7 @@ class TestCheckParallelFramework: def test_worker_check_single_file_no_checkers(self) -> None: linter = PyLinter(reporter=Reporter()) - worker_initialize(linter=linter) + worker_initialize(linter=dill.dumps(linter)) ( _, # proc-id @@ -225,7 +242,7 @@ class TestCheckParallelFramework: def test_worker_check_sequential_checker(self) -> None: """Same as test_worker_check_single_file_no_checkers with SequentialTestChecker""" linter = PyLinter(reporter=Reporter()) - worker_initialize(linter=linter) + worker_initialize(linter=dill.dumps(linter)) # Add the only checker we care about in this test linter.register_checker(SequentialTestChecker(linter)) -- cgit v1.2.1