From 1d7436935f00379d00682c9ec1059b3a1c279fce Mon Sep 17 00:00:00 2001 From: Matt Clay Date: Tue, 6 Dec 2022 14:29:45 -0800 Subject: [stable-2.13] ansible-test - Improve container management. (#79538) * ansible-test - More flexible become support. (cherry picked from commit 5666c6d6a3207f279cbb3ce0b0bd070ab5a9ecd4) * ansible-test - Add support for more remotes. (cherry picked from commit 24d91f552cad2a485f286f3c34cbba2005599ab4) * ansible-test - Enable ACLs on FreeBSD remotes. This allows integration tests to switch from one unprivileged user to another, sharing files between the users using ACLs. (cherry picked from commit b63812bc08fd00fd772c28a2604f77f487d23104) * ansible-test - Enable ACL support on more remotes. (#78299) (cherry picked from commit 8bb20fe06b458e6d7c4290dcb86cced3bce9d825) * ansible-test - Install `acl` on Alpine remotes. (#78303) (cherry picked from commit 8565deaae393a136ada071c27a8b7791d3640e41) * ansible-test - Use sudo for all remotes. (#78310) (cherry picked from commit f3f4ad93325725b91902984b419502e8b4b97188) * ansible-test - Fix Ubuntu 20.04 bootstrapping. (#78551) (cherry picked from commit d683c5bd212375c9ffc785ab65ee6e68ea913c8b) * ansible-test - Verify executables are executable. (#78606) (cherry picked from commit ece85abbc46e087187caf6e05b1515b97c578531) * ansible-test - Use --tmpfs to run containers. (#78605) (cherry picked from commit 4187707f035a5dde9d02e99e5dec40d71b06d5d1) * ansible-test - Remove Fedora 35 support. (#78720) (cherry picked from commit 38a82a5cc43ed4b48cf5fbc3addc5d2993c37eef) * ansible-test - Fix container error handling. (#78739) (cherry picked from commit 79f67ed56116be11b1c992fade04acf06d9208d1) * ansible-test - Improve container management. (#78550) See changelogs/fragments/ansible-test-container-management.yml for details. (cherry picked from commit cda16cc5e9aa8703fb4e1ac0a0be6b631d9076cc) * ansible-test - Fix container detection. (#79530) (cherry picked from commit 80d2f8da02052f64396da6b8caaf820eedbf18e2) * Update test matrix symlinks. (#78737) * Add remote platform symlinks. These were already supported by ansible-test. This change adds the symlinks for use in the CI test matrix. * Remove obsolete AIX test symlink. AIX was already removed from ansible-test. (cherry picked from commit 62221a3be214082610ba33c62e82bc3a778be7fb) * Use patched containers without VOLUME instruction. * Make test suite compatible with older ansible. * Fix compatibility with Python 3.8. --- .azure-pipelines/azure-pipelines.yml | 17 + .../ansible-test-container-management.yml | 63 ++ .../fragments/ansible-test-container-tmpfs.yml | 5 + .../fragments/ansible-test-generalize-become.yml | 2 + changelogs/fragments/ansible-test-more-remotes.yml | 5 + .../fragments/ansible-test-podman-create-retry.yml | 3 + changelogs/fragments/ansible-test-remote-acl.yml | 5 + .../fragments/ansible-test-remote-become.yml | 3 + .../ansible-test-ubuntu-bootstrap-fix.yml | 2 + .../fragments/ansible-test-verify-executables.yml | 6 + .../rst/community/create_pr_quick_start.rst | 8 +- .../testing/sanity/integration-aliases.rst | 1 + .../rst/dev_guide/testing_running_locally.rst | 345 ++++++- .../targets/ansible-test-container/aliases | 5 + .../targets/ansible-test-container/runme.py | 1053 ++++++++++++++++++++ .../targets/ansible-test-container/runme.sh | 5 + test/lib/ansible_test/_data/completion/docker.txt | 22 +- test/lib/ansible_test/_data/completion/remote.txt | 27 +- test/lib/ansible_test/_internal/__init__.py | 9 + test/lib/ansible_test/_internal/ansible_util.py | 5 + test/lib/ansible_test/_internal/become.py | 57 ++ test/lib/ansible_test/_internal/cgroup.py | 110 ++ .../_internal/cli/argparsing/parsers.py | 13 + .../lib/ansible_test/_internal/cli/environments.py | 20 + .../_internal/cli/parsers/key_value_parsers.py | 16 + .../_internal/commands/env/__init__.py | 23 +- .../_internal/commands/integration/__init__.py | 4 + .../_internal/commands/integration/coverage.py | 5 +- .../commands/sanity/integration_aliases.py | 3 + .../_internal/commands/shell/__init__.py | 19 +- test/lib/ansible_test/_internal/completion.py | 51 + test/lib/ansible_test/_internal/config.py | 3 + test/lib/ansible_test/_internal/constants.py | 2 + test/lib/ansible_test/_internal/containers.py | 148 ++- test/lib/ansible_test/_internal/coverage_util.py | 5 + test/lib/ansible_test/_internal/delegation.py | 12 + test/lib/ansible_test/_internal/dev/__init__.py | 2 + .../ansible_test/_internal/dev/container_probe.py | 216 ++++ test/lib/ansible_test/_internal/docker_util.py | 778 +++++++++++---- test/lib/ansible_test/_internal/host_configs.py | 20 + test/lib/ansible_test/_internal/host_profiles.py | 666 ++++++++++++- test/lib/ansible_test/_internal/provisioning.py | 16 +- test/lib/ansible_test/_internal/target.py | 2 + test/lib/ansible_test/_internal/thread.py | 24 + test/lib/ansible_test/_internal/util.py | 27 + test/lib/ansible_test/_internal/util_common.py | 9 +- .../sanity/pylint/config/ansible-test.cfg | 1 + .../controller/sanity/pylint/plugins/unwanted.py | 9 + .../ansible_test/_util/target/setup/bootstrap.sh | 123 ++- .../_util/target/setup/check_systemd_cgroup_v1.sh | 17 + .../_util/target/setup/probe_cgroups.py | 31 + test/utils/shippable/aix.sh | 1 - test/utils/shippable/alpine.sh | 1 + test/utils/shippable/fedora.sh | 1 + test/utils/shippable/ubuntu.sh | 1 + 55 files changed, 3727 insertions(+), 300 deletions(-) create mode 100644 changelogs/fragments/ansible-test-container-management.yml create mode 100644 changelogs/fragments/ansible-test-container-tmpfs.yml create mode 100644 changelogs/fragments/ansible-test-generalize-become.yml create mode 100644 changelogs/fragments/ansible-test-more-remotes.yml create mode 100644 changelogs/fragments/ansible-test-podman-create-retry.yml create mode 100644 changelogs/fragments/ansible-test-remote-acl.yml create mode 100644 changelogs/fragments/ansible-test-remote-become.yml create mode 100644 changelogs/fragments/ansible-test-ubuntu-bootstrap-fix.yml create mode 100644 changelogs/fragments/ansible-test-verify-executables.yml create mode 100644 test/integration/targets/ansible-test-container/aliases create mode 100755 test/integration/targets/ansible-test-container/runme.py create mode 100755 test/integration/targets/ansible-test-container/runme.sh create mode 100644 test/lib/ansible_test/_internal/cgroup.py create mode 100644 test/lib/ansible_test/_internal/dev/__init__.py create mode 100644 test/lib/ansible_test/_internal/dev/container_probe.py create mode 100644 test/lib/ansible_test/_util/target/setup/check_systemd_cgroup_v1.sh create mode 100644 test/lib/ansible_test/_util/target/setup/probe_cgroups.py delete mode 120000 test/utils/shippable/aix.sh create mode 120000 test/utils/shippable/alpine.sh create mode 120000 test/utils/shippable/fedora.sh create mode 120000 test/utils/shippable/ubuntu.sh diff --git a/.azure-pipelines/azure-pipelines.yml b/.azure-pipelines/azure-pipelines.yml index 6babf1f425..5dd0eb2771 100644 --- a/.azure-pipelines/azure-pipelines.yml +++ b/.azure-pipelines/azure-pipelines.yml @@ -106,6 +106,23 @@ stages: - 3 - 4 - 5 + - template: templates/matrix.yml # context/controller (ansible-test container management) + parameters: + targets: + - name: Alpine 3.16 + test: alpine/3.16 + - name: Fedora 36 + test: fedora/36 + - name: RHEL 8.5 + test: rhel/8.5 + - name: RHEL 9.0 + test: rhel/9.0 + - name: Ubuntu 20.04 + test: ubuntu/20.04 + - name: Ubuntu 22.04 + test: ubuntu/22.04 + groups: + - 6 - stage: Docker dependsOn: [] jobs: diff --git a/changelogs/fragments/ansible-test-container-management.yml b/changelogs/fragments/ansible-test-container-management.yml new file mode 100644 index 0000000000..04961b98ee --- /dev/null +++ b/changelogs/fragments/ansible-test-container-management.yml @@ -0,0 +1,63 @@ +major_changes: + - ansible-test - Docker and Podman are now supported on hosts with cgroup v2 unified. + Previously only cgroup v1 and cgroup v2 hybrid were supported. + - ansible-test - Docker Desktop on WSL2 is now supported (additional configuration required). + - ansible-test - Podman on WSL2 is now supported. + - ansible-test - Podman now works on container hosts without systemd. + Previously only some containers worked, while others required rootfull or rootless Podman, + but would not work with both. Some containers did not work at all. + - ansible-test - When additional cgroup setup is required on the container host, this will be automatically detected. + Instructions on how to configure the host will be provided in the error message shown. +minor_changes: + - ansible-test - When using Podman, ansible-test will detect if the loginuid used in containers is incorrect. + When this occurs a warning is displayed and the container is run with the AUDIT_CONTROL capability. + Previously containers would fail under this situation, with no useful warnings or errors given. + - ansible-test - Failure to connect to a container over SSH now results in a clear error. + Previously tests would be attempted even after initial connection attempts failed. + - ansible-test - Warnings are now shown when using containers that were built with VOLUME instructions. + - ansible-test - Unit tests now support network disconnect by default when running under Podman. + Previously this feature only worked by default under Docker. + - ansible-test - Additional log details are shown when containers fail to start or SSH connections to containers fail. + - ansible-test - Containers included with ansible-test no longer disable seccomp by default. + - ansible-test - A new ``cgroup`` option is available when running custom containers. + This option can be used to indicate a container requires cgroup v1 or that it does not use cgroup. + The default behavior assumes the container works with cgroup v2 (as well as v1). + - ansible-test - A new ``audit`` option is available when running custom containers. + This option can be used to indicate whether a container requires the AUDIT_WRITE capability. + The default is ``required``, which most containers will need when using Podman. + If necessary, the ``none`` option can be used to opt-out of the capability. + This has no effect on Docker, which always provides the capability. + - ansible-test - More details are provided about an instance when provisioning fails. + - ansible-test - Connection failures to remote provisioned hosts now show failure details as a warning. + - ansible-test - When setting the max open files for containers, the container host's limit will be checked. + If the host limit is lower than the preferred value, it will be used and a warning will be shown. + - ansible-test - Use ``stop --time 0`` followed by ``rm`` to remove ephemeral containers instead of ``rm -f``. + This speeds up teardown of ephemeral containers. + - ansible-test - Reduce the polling limit for SSHD startup in containers from 60 retries to 10. + The one second delay between retries remains in place. + - ansible-test - Integration tests can be excluded from retries triggered by the ``--retry-on-error`` option by + adding the ``retry/never`` alias. This is useful for tests that cannot pass on a retry or are too + slow to make retries useful. + - ansible-test - The ``ansible-test env`` command now detects and reports the container ID if running in a container. +bugfixes: + - ansible-test - Multiple containers now work under Podman without specifying the ``--docker-network`` option. + - ansible-test - Prevent concurrent / repeat pulls of the same container image. + - ansible-test - Prevent concurrent / repeat inspections of the same container image. + - ansible-test - Prevent concurrent execution of cached methods. + - ansible-test - Handle server errors when executing the ``docker info`` command. + - ansible-test - Show the exception type when reporting errors during instance provisioning. + - ansible-test - Pass the ``XDG_RUNTIME_DIR`` environment variable through to container commands. + - ansible-test - Connection attempts to managed remote instances no longer abort on ``Permission denied`` errors. + - ansible-test - Detection for running in a Podman or Docker container has been fixed to detect more scenarios. + The new detection relies on ``/proc/self/mountinfo`` instead of ``/proc/self/cpuset``. + Detection now works with custom cgroups and private cgroup namespaces. +known_issues: + - ansible-test - Using Docker on systems with SELinux may require setting SELinux to permissive mode. + Podman should work with SELinux in enforcing mode. + - ansible-test - Additional configuration may be required for certain container host and container combinations. + Further details are available in the testing documentation. + - ansible-test - Systems with Podman networking issues may be unable to run containers, when previously the issue + went unreported. Correct the networking issues to continue using ``ansible-test`` with Podman. + - ansible-test - Custom containers with ``VOLUME`` instructions may be unable to start, when previously the containers + started correctly. Remove the ``VOLUME`` instructions to resolve the issue. Containers with this + condition will cause ``ansible-test`` to emit a warning. diff --git a/changelogs/fragments/ansible-test-container-tmpfs.yml b/changelogs/fragments/ansible-test-container-tmpfs.yml new file mode 100644 index 0000000000..678cd0770d --- /dev/null +++ b/changelogs/fragments/ansible-test-container-tmpfs.yml @@ -0,0 +1,5 @@ +bugfixes: + - ansible-test - Test containers are now run with the ``--tmpfs`` option for ``/tmp``, ``/run`` and ``/run/lock``. + This allows use of containers built without the ``VOLUME`` instruction. + Additionally, containers with those volumes defined no longer create anonymous volumes for them. + This avoids leaving behind volumes on the container host after the container is stopped and deleted. diff --git a/changelogs/fragments/ansible-test-generalize-become.yml b/changelogs/fragments/ansible-test-generalize-become.yml new file mode 100644 index 0000000000..1831c05228 --- /dev/null +++ b/changelogs/fragments/ansible-test-generalize-become.yml @@ -0,0 +1,2 @@ +minor_changes: + - ansible-test - Become support for remote instance provisioning is no longer tied to a fixed list of platforms. diff --git a/changelogs/fragments/ansible-test-more-remotes.yml b/changelogs/fragments/ansible-test-more-remotes.yml new file mode 100644 index 0000000000..7eb1615011 --- /dev/null +++ b/changelogs/fragments/ansible-test-more-remotes.yml @@ -0,0 +1,5 @@ +minor_changes: + - ansible-test - Add support for provisioning remotes which require ``doas`` for become. + - ansible-test - Add support for provisioning Ubuntu 20.04 remote instances. + - ansible-test - Add support for provisioning Alpine 3.16 remote instances. + - ansible-test - Add support for provisioning Fedora 36 remote instances. diff --git a/changelogs/fragments/ansible-test-podman-create-retry.yml b/changelogs/fragments/ansible-test-podman-create-retry.yml new file mode 100644 index 0000000000..7416e89f79 --- /dev/null +++ b/changelogs/fragments/ansible-test-podman-create-retry.yml @@ -0,0 +1,3 @@ +bugfixes: + - ansible-test - Always remove containers after failing to create/run them. + This avoids leaving behind created containers when using podman. diff --git a/changelogs/fragments/ansible-test-remote-acl.yml b/changelogs/fragments/ansible-test-remote-acl.yml new file mode 100644 index 0000000000..79ff7e5148 --- /dev/null +++ b/changelogs/fragments/ansible-test-remote-acl.yml @@ -0,0 +1,5 @@ +minor_changes: + - ansible-test - Remote FreeBSD instances now have ACLs enabled on the root filesystem. + - ansible-test - Remote Fedora instances now have the ``acl`` package installed. + - ansible-test - Remote Ubuntu instances now have the ``acl`` package installed. + - ansible-test - Remote Alpine instances now have the ``acl`` package installed. diff --git a/changelogs/fragments/ansible-test-remote-become.yml b/changelogs/fragments/ansible-test-remote-become.yml new file mode 100644 index 0000000000..031cac34ba --- /dev/null +++ b/changelogs/fragments/ansible-test-remote-become.yml @@ -0,0 +1,3 @@ +minor_changes: + - ansible-test - Alpine remotes now use ``sudo`` for tests, using ``doas`` only for bootstrapping. + - ansible-test - FreeBSD remotes now use ``sudo`` for tests, using ``su`` only for bootstrapping. diff --git a/changelogs/fragments/ansible-test-ubuntu-bootstrap-fix.yml b/changelogs/fragments/ansible-test-ubuntu-bootstrap-fix.yml new file mode 100644 index 0000000000..92666bed73 --- /dev/null +++ b/changelogs/fragments/ansible-test-ubuntu-bootstrap-fix.yml @@ -0,0 +1,2 @@ +bugfixes: + - ansible-test - Fix bootstrapping of Python 3.9 on Ubuntu 20.04 remotes. diff --git a/changelogs/fragments/ansible-test-verify-executables.yml b/changelogs/fragments/ansible-test-verify-executables.yml new file mode 100644 index 0000000000..a1eff95d09 --- /dev/null +++ b/changelogs/fragments/ansible-test-verify-executables.yml @@ -0,0 +1,6 @@ +bugfixes: + - ansible-test - Temporary executables are now verified as executable after creation. + Without this check, path injected scripts may not be found, + typically on systems with ``/tmp`` mounted using the "noexec" option. + This can manifest as a missing Python interpreter, or use of the wrong Python interpreter, as well + as other error conditions. diff --git a/docs/docsite/rst/community/create_pr_quick_start.rst b/docs/docsite/rst/community/create_pr_quick_start.rst index d06f58ddd7..d645e8b940 100644 --- a/docs/docsite/rst/community/create_pr_quick_start.rst +++ b/docs/docsite/rst/community/create_pr_quick_start.rst @@ -16,7 +16,7 @@ Prepare your environment These steps assume a Linux work environment with ``git`` installed. -1. Install and start ``docker`` or ``podman`` with the ``docker`` executable shim. This insures tests run properly isolated and in the exact environments as in CI. The latest ``ansible-core`` development version also supports the ``podman`` CLI program. +1. Install and start ``docker`` or ``podman``. This ensures tests run properly isolated and in the same environment as in CI. 2. :ref:`Install Ansible or ansible-core `. You need the ``ansible-test`` utility which is provided by either of these packages. @@ -157,7 +157,7 @@ Test your changes 1. Install ``flake8`` (``pip install flake8``, or install the corresponding package on your operating system). -1. Run ``flake8`` against a changed file: +2. Run ``flake8`` against a changed file: .. code-block:: bash @@ -167,7 +167,7 @@ Test your changes This shows unused imports, which is not shown by sanity tests, as well as other common issues. Optionally, you can use the ``--max-line-length=160`` command-line argument. -2. Run sanity tests: +3. Run sanity tests: .. code-block:: bash @@ -176,7 +176,7 @@ Test your changes If they failed, look at the output carefully - it is informative and helps to identify a problem line quickly. Sanity failings usually relate to incorrect code and documentation formatting. -3. Run integration tests: +4. Run integration tests: .. code-block:: bash diff --git a/docs/docsite/rst/dev_guide/testing/sanity/integration-aliases.rst b/docs/docsite/rst/dev_guide/testing/sanity/integration-aliases.rst index 39a03126a8..e14188d6cc 100644 --- a/docs/docsite/rst/dev_guide/testing/sanity/integration-aliases.rst +++ b/docs/docsite/rst/dev_guide/testing/sanity/integration-aliases.rst @@ -96,6 +96,7 @@ There are several other aliases available as well: - ``destructive`` - Requires ``--allow-destructive`` to run without ``--docker`` or ``--remote``. - ``hidden`` - Target is ignored. Usable as a dependency. Automatic for ``setup_`` and ``prepare_`` prefixed targets. +- ``retry/never`` - Target is excluded from retries enabled by the ``--retry-on-error`` option. Unstable -------- diff --git a/docs/docsite/rst/dev_guide/testing_running_locally.rst b/docs/docsite/rst/dev_guide/testing_running_locally.rst index dcf7e6d9f7..40910a87ec 100644 --- a/docs/docsite/rst/dev_guide/testing_running_locally.rst +++ b/docs/docsite/rst/dev_guide/testing_running_locally.rst @@ -2,44 +2,329 @@ .. _testing_running_locally: -*************** -Testing Ansible -*************** +******************************* +Testing Ansible and Collections +******************************* -This document describes how to: - -* Run tests locally using ``ansible-test`` -* Extend +This document describes how to run tests using ``ansible-test``. .. contents:: :local: -Requirements -============ +Setup +===== -There are no special requirements for running ``ansible-test`` on Python 2.7 or later. -The ``argparse`` package is required for Python 2.6. -The requirements for each ``ansible-test`` command are covered later. +Before running ``ansible-test``, set up your environment for :ref:`Testing an Ansible Collection` or +:ref:`Testing ansible-core`, depending on which scenario applies to you. +.. warning:: -Test Environments -================= + If you use ``git`` for version control, make sure the files you are working with are not ignored by ``git``. + If they are, ``ansible-test`` will ignore them as well. + +Testing an Ansible Collection +----------------------------- + +If you are testing an Ansible Collection, you need a copy of the collection, preferably a git clone. +For example, to work with the ``community.windows`` collection, follow these steps: + +1. Clone the collection you want to test into a valid collection root: + + .. code-block:: shell + + git clone https://github.com/ansible-collections/community.windows ~/dev/ansible_collections/community/windows + + .. important:: + + The path must end with ``/ansible_collections/{collection_namespace}/{collection_name}`` where + ``{collection_namespace}`` is the namespace of the collection and ``{collection_name}`` is the collection name. + +2. Clone any collections on which the collection depends: + + .. code-block:: shell + + git clone https://github.com/ansible-collections/ansible.windows ~/dev/ansible_collections/ansible/windows + + .. important:: + + If your collection has any dependencies on other collections, they must be in the same collection root, since + ``ansible-test`` will not use your configured collection roots (or other Ansible configuration). + + .. note:: + + See the collection's ``galaxy.yml`` for a list of possible dependencies. + +3. Switch to the directory where the collection to test resides: + + .. code-block:: shell + + cd ~/dev/ansible_collections/community/windows + +Testing ``ansible-core`` +------------------------ + +If you are testing ``ansible-core`` itself, you need a copy of the ``ansible-core`` source code, preferably a git clone. +Having an installed copy of ``ansible-core`` is not sufficient or required. +For example, to work with the ``ansible-core`` source cloned from GitHub, follow these steps: + +1. Clone the ``ansible-core`` repository: + + .. code-block:: shell + + git clone https://github.com/ansible/ansible ~/dev/ansible + +2. Switch to the directory where the ``ansible-core`` source resides: + + .. code-block:: shell + + cd ~/dev/ansible + +3. Add ``ansible-core`` programs to your ``PATH``: + + .. code-block:: shell + + source hacking/env-setup + + .. note:: + + You can skip this step if you only need to run ``ansible-test``, and not other ``ansible-core`` programs. + In that case, simply run ``bin/ansible-test`` from the root of the ``ansible-core`` source. + + .. caution:: + + If you have an installed version of ``ansible-core`` and are trying to run ``ansible-test`` from your ``PATH``, + make sure the program found by your shell is the one from the ``ansible-core`` source: + + .. code-block:: shell + + which ansible-test + +Commands +======== + +The most commonly used test commands are: + +* ``ansible-test sanity`` - Run sanity tests (mostly linters and static analysis). +* ``ansible-test integration`` - Run integration tests. +* ``ansible-test units`` - Run unit tests. + +Run ``ansible-test --help`` to see a complete list of available commands. + +.. note:: + + For detailed help on a specific command, add the ``--help`` option after the command. + +Environments +============ Most ``ansible-test`` commands support running in one or more isolated test environments to simplify testing. +Containers +---------- + +Containers are recommended for running sanity, unit and integration tests, since they provide consistent environments. +Unit tests will be run with network isolation, which avoids unintentional dependencies on network resources. + +The ``--docker`` option runs tests in a container using either Docker or Podman. + +.. note:: + + If both Docker and Podman are installed, Docker will be used. + To override this, set the environment variable ``ANSIBLE_TEST_PREFER_PODMAN`` to any non-empty value. + +Choosing a container +^^^^^^^^^^^^^^^^^^^^ + +Without an additional argument, the ``--docker`` option uses the ``default`` container. +To use another container, specify it immediately after the ``--docker`` option. + +.. note:: + + The ``default`` container is recommended for all sanity and unit tests. + +To see the list of supported containers, use the ``--help`` option with the ``ansible-test`` command you want to use. + +.. note:: + + The list of available containers is dependent on the ``ansible-test`` command you are using. + +You can also specify your own container. +When doing so, you will need to indicate the Python version in the container with the ``--python`` option. + +Custom containers +""""""""""""""""" + +When building custom containers, keep in mind the following requirements: + +* The ``USER`` should be ``root``. +* Use an ``init`` process, such as ``systemd``. +* Include ``sshd`` and accept connections on the default port of ``22``. +* Include a POSIX compatible ``sh`` shell which can be found on ``PATH``. +* Include a ``sleep`` utility which runs as a subprocess. +* Include a supported version of Python. +* Avoid using the ``VOLUME`` statement. + +Docker and SELinux +^^^^^^^^^^^^^^^^^^ + +Using Docker on a host with SELinux may require setting the system in permissive mode. +Consider using Podman instead. + +Docker Desktop with WSL2 +^^^^^^^^^^^^^^^^^^^^^^^^ + +These instructions explain how to use ``ansible-test`` with WSL2 and Docker Desktop *without* ``systemd`` support. -Remote ------- +.. note:: -The ``--remote`` option runs tests in a cloud hosted environment. -An API key is required to use this feature. + If your WSL2 environment includes ``systemd`` support, these steps are not required. - Recommended for integration tests. +Configuration requirements +"""""""""""""""""""""""""" -See the `list of supported platforms and versions `_ for additional details. +1. Open Docker Desktop and go to the **Settings** screen. +2. On the the **General** tab: -Environment Variables ---------------------- + a. Uncheck the **Start Docker Desktop when you log in** checkbox. + b. Check the **Use the WSL 2 based engine** checkbox. + +3. On the **Resources** tab under the **WSL Integration** section: + + a. Enable distros you want to use under the **Enable integration with additional distros** section. + +4. Click **Apply and restart** if changes were made. + +Setup instructions +"""""""""""""""""" + +.. note:: + + If all WSL instances have been stopped, these changes will need to be re-applied. + +1. Verify Docker Desktop is properly configured (see :ref:`Configuration requirements`). +2. Quit Docker Desktop if it is running: + + a. Right click the **Docker Desktop** taskbar icon. + b. Click the **Quit Docker Desktop** option. + +3. Stop any running WSL instances with the command: + + .. code-block:: shell + + wsl --shutdown + +4. Verify all WSL instances have stopped with the command: + + .. code-block:: shell + + wsl -l -v + +5. Start a WSL instance and perform the following steps as ``root``: + + a. Verify the ``systemd`` subsystem is not registered: + + a. Check for the ``systemd`` cgroup hierarchy with the following command: + + .. code-block:: shell + + grep systemd /proc/self/cgroup + + b. If any matches are found, re-check the :ref:`Configuration requirements` and follow the + :ref:`Setup instructions` again. + + b. Mount the ``systemd`` cgroup hierarchy with the following commands: + + .. code-block:: shell + + mkdir /sys/fs/cgroup/systemd + mount cgroup -t cgroup /sys/fs/cgroup/systemd -o none,name=systemd,xattr + +6. Start Docker Desktop. + +You should now be able to use ``ansible-test`` with the ``--docker`` option. + +Linux cgroup configuration +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +.. note:: + + These changes will need to be re-applied each time the container host is booted. + +For certain container hosts and container combinations, additional setup on the container host may be required. +In these situations ``ansible-test`` will report an error and provide additional instructions to run as ``root``: + +.. code-block:: shell + + mkdir /sys/fs/cgroup/systemd + mount cgroup -t cgroup /sys/fs/cgroup/systemd -o none,name=systemd,xattr + +If you are using rootless Podman, an additional command must be run, also as ``root``. +Make sure to substitute your user and group for ``{user}`` and ``{group}`` respectively: + +.. code-block:: shell + + chown -R {user}:{group} /sys/fs/cgroup/systemd + +Podman +"""""" + +When using Podman, you may need to stop existing Podman processes after following the :ref:`Linux cgroup configuration` +instructions. Otherwise Podman may be unable to see the new mount point. + +You can check to see if Podman is running by looking for ``podman`` and ``catatonit`` processes. + +Remote virtual machines +----------------------- + +Remote virtual machines are recommended for running integration tests not suitable for execution in containers. + +The ``--remote`` option runs tests in a cloud hosted ephemeral virtual machine. + +.. note:: + + An API key is required to use this feature, unless running under an approved Azure Pipelines organization. + +To see the list of supported systems, use the ``--help`` option with the ``ansible-test`` command you want to use. + +.. note:: + + The list of available systems is dependent on the ``ansible-test`` command you are using. + +Python virtual environments +--------------------------- + +Python virtual environments provide a simple way to achieve isolation from the system and user Python environments. +They are recommended for unit and integration tests when the ``--docker`` and ``--remote`` options cannot be used. + +The ``--venv`` option runs tests in a virtual environment managed by ``ansible-test``. +Requirements are automatically installed before tests are run. + +Composite environment arguments +------------------------------- + +The environment arguments covered in this document are sufficient for most use cases. +However, some scenarios may require the additional flexibility offered by composite environment arguments. + +The ``--controller`` and ``--target`` options are alternatives to the ``--docker``, ``--remote`` and ``--venv`` options. + +.. note:: + + When using the ``shell`` command, the ``--target`` option is replaced by three platform specific options. + +Add the ``--help`` option to your ``ansible-test`` command to learn more about the composite environment arguments. + +Additional Requirements +======================= + +Some ``ansible-test`` commands have additional requirements. +You can use the ``--requirements`` option to automatically install them. + +.. note:: + + When using a test environment managed by ``ansible-test`` the ``--requirements`` option is usually unnecessary. + +Environment variables +===================== When using environment variables to manipulate tests there some limitations to keep in mind. Environment variables are: @@ -51,16 +336,15 @@ When using environment variables to manipulate tests there some limitations to k and the tests executed. This is useful for debugging tests inside a container by following the :ref:`Debugging AnsibleModule-based modules ` instructions. -Interactive Shell +Interactive shell ================= Use the ``ansible-test shell`` command to get an interactive shell in the same environment used to run tests. Examples: * ``ansible-test shell --docker`` - Open a shell in the default docker container. -* ``ansible-test shell --venv --python 3.6`` - Open a shell in a Python 3.6 virtual environment. - +* ``ansible-test shell --venv --python 3.10`` - Open a shell in a Python 3.10 virtual environment. -Code Coverage +Code coverage ============= Code coverage reports make it easy to identify untested code for which more tests should @@ -72,22 +356,17 @@ aren't using the ``--venv`` or ``--docker`` options which create an isolated pyt environment then you may have to use the ``--requirements`` option to ensure that the correct version of the coverage module is installed: -.. code-block:: shell-session +.. code-block:: shell ansible-test coverage erase ansible-test units --coverage apt ansible-test integration --coverage aws_lambda ansible-test coverage html - Reports can be generated in several different formats: * ``ansible-test coverage report`` - Console report. * ``ansible-test coverage html`` - HTML report. * ``ansible-test coverage xml`` - XML report. -To clear data between test runs, use the ``ansible-test coverage erase`` command. For a full list of features see the online help: - -.. code-block:: shell-session - - ansible-test coverage --help +To clear data between test runs, use the ``ansible-test coverage erase`` command. diff --git a/test/integration/targets/ansible-test-container/aliases b/test/integration/targets/ansible-test-container/aliases new file mode 100644 index 0000000000..65a0509311 --- /dev/null +++ b/test/integration/targets/ansible-test-container/aliases @@ -0,0 +1,5 @@ +shippable/posix/group6 +context/controller +needs/root +destructive +retry/never # tests on some platforms run too long to make retries useful diff --git a/test/integration/targets/ansible-test-container/runme.py b/test/integration/targets/ansible-test-container/runme.py new file mode 100755 index 0000000000..ca0ce86f93 --- /dev/null +++ b/test/integration/targets/ansible-test-container/runme.py @@ -0,0 +1,1053 @@ +#!/usr/bin/env python +"""Test suite used to verify ansible-test is able to run its containers on various container hosts.""" + +from __future__ import annotations + +import abc +import dataclasses +import datetime +import errno +import functools +import json +import os +import pathlib +import pwd +import re +import secrets +import shlex +import shutil +import signal +import subprocess +import sys +import tempfile +import time +import typing as t + +UNPRIVILEGED_USER_NAME = 'ansible-test' +CGROUP_SYSTEMD = pathlib.Path('/sys/fs/cgroup/systemd') +LOG_PATH = pathlib.Path('/tmp/results') + +# The value of /proc/*/loginuid when it is not set. +# It is a reserved UID, which is the maximum 32-bit unsigned integer value. +# See: https://access.redhat.com/solutions/25404 +LOGINUID_NOT_SET = 4294967295 + +UID = os.getuid() + +try: + LOGINUID = int(pathlib.Path('/proc/self/loginuid').read_text()) + LOGINUID_MISMATCH = LOGINUID != LOGINUID_NOT_SET and LOGINUID != UID +except FileNotFoundError: + LOGINUID = None + LOGINUID_MISMATCH = False + + +def main() -> None: + """Main program entry point.""" + display.section('Startup check') + + try: + bootstrap_type = pathlib.Path('/etc/ansible-test.bootstrap').read_text().strip() + except FileNotFoundError: + bootstrap_type = 'undefined' + + display.info(f'Bootstrap type: {bootstrap_type}') + + if bootstrap_type != 'remote': + display.warning('Skipping destructive test on system which is not an ansible-test remote provisioned instance.') + return + + display.info(f'UID: {UID} / {LOGINUID}') + + if UID != 0: + raise Exception('This test must be run as root.') + + if not LOGINUID_MISMATCH: + if LOGINUID is None: + display.warning('Tests involving loginuid mismatch will be skipped on this host since it does not have audit support.') + elif LOGINUID == LOGINUID_NOT_SET: + display.warning('Tests involving loginuid mismatch will be skipped on this host since it is not set.') + elif LOGINUID == 0: + raise Exception('Use sudo, su, etc. as a non-root user to become root before running this test.') + else: + raise Exception() + + display.section(f'Bootstrapping {os_release}') + + bootstrapper = Bootstrapper.init() + bootstrapper.run() + + result_dir = LOG_PATH + + if result_dir.exists(): + shutil.rmtree(result_dir) + + result_dir.mkdir() + result_dir.chmod(0o777) + + scenarios = get_test_scenarios() + results = [run_test(scenario) for scenario in scenarios] + error_total = 0 + + for name in sorted(result_dir.glob('*.log')): + lines = name.read_text().strip().splitlines() + error_count = len([line for line in lines if line.startswith('FAIL: ')]) + error_total += error_count + + display.section(f'Log ({error_count=}/{len(lines)}): {name.name}') + + for line in lines: + if line.startswith('FAIL: '): + display.show(line, display.RED) + else: + display.show(line) + + error_count = len([result for result in results if result.message]) + error_total += error_count + + duration = datetime.timedelta(seconds=int(sum(result.duration.total_seconds() for result in results))) + + display.section(f'Test Results ({error_count=}/{len(results)}) [{duration}]') + + for result in results: + notes = f' ' if result.cleanup else '' + + if result.cgroup_dirs: + notes += f' ' + + notes += f' [{result.duration}]' + + if result.message: + display.show(f'FAIL: {result.scenario} {result.message}{notes}', display.RED) + elif result.duration.total_seconds() >= 90: + display.show(f'SLOW: {result.scenario}{notes}', display.YELLOW) + else: + display.show(f'PASS: {result.scenario}{notes}') + + if error_total: + sys.exit(1) + + +def get_test_scenarios() -> list[TestScenario]: + """Generate and return a list of test scenarios.""" + + supported_engines = ('docker', 'podman') + available_engines = [engine for engine in supported_engines if shutil.which(engine)] + + if not available_engines: + raise ApplicationError(f'No supported container engines found: {", ".join(supported_engines)}') + + completion_lines = pathlib.Path(os.environ['PYTHONPATH'], '../test/lib/ansible_test/_data/completion/docker.txt').read_text().splitlines() + + # TODO: consider including testing for the collection default image + entries = {name: value for name, value in (parse_completion_entry(line) for line in completion_lines) if name != 'default'} + + unprivileged_user = User.get(UNPRIVILEGED_USER_NAME) + + scenarios: list[TestScenario] = [] + + for container_name, settings in entries.items(): + image = settings['image'] + cgroup = settings.get('cgroup', 'v1-v2') + + for engine in available_engines: + # TODO: figure out how to get tests passing using docker without disabling selinux + disable_selinux = os_release.id == 'fedora' and engine == 'docker' and cgroup != 'none' + expose_cgroup_v1 = cgroup == 'v1-only' and get_docker_info(engine).cgroup_version != 1 + + if cgroup != 'none' and get_docker_info(engine).cgroup_version == 1 and not have_cgroup_systemd(): + expose_cgroup_v1 = True # the host uses cgroup v1 but there is no systemd cgroup and the container requires cgroup support + + user_scenarios = [ + # TODO: test rootless docker + UserScenario(ssh=unprivileged_user), + ] + + if engine == 'podman': + user_scenarios.append(UserScenario(ssh=ROOT_USER)) + + # TODO: test podman remote on Alpine and Ubuntu hosts + # TODO: combine remote with ssh using different unprivileged users + if os_release.id not in ('alpine', 'ubuntu'): + user_scenarios.append(UserScenario(remote=unprivileged_user)) + + if LOGINUID_MISMATCH: + user_scenarios.append(UserScenario()) + + for user_scenario in user_scenarios: + scenarios.append( + TestScenario( + user_scenario=user_scenario, + engine=engine, + container_name=container_name, + image=image, + disable_selinux=disable_selinux, + expose_cgroup_v1=expose_cgroup_v1, + ) + ) + + return scenarios + + +def run_test(scenario: TestScenario) -> TestResult: + """Run a test scenario and return the test results.""" + display.section(f'Testing {scenario} Started') + + start = time.monotonic() + + integration = ['ansible-test', 'integration', 'split'] + integration_options = ['--target', f'docker:{scenario.container_name}', '--color', '--truncate', '0', '-v', '--dev-probe-cgroups', str(LOG_PATH), + '--dev-systemd-debug'] + + commands = [ + [*integration, *integration_options], + # For the split test we'll use alpine3 as the controller. There are two reasons for this: + # 1) It doesn't require the cgroup v1 hack, so we can test a target that doesn't need that. + # 2) It doesn't require disabling selinux, so we can test a target that doesn't need that. + [*integration, '--controller', 'docker:alpine3', *integration_options], + ] + + common_env: dict[str, str] = {} + test_env: dict[str, str] = {} + + if scenario.engine == 'podman': + if scenario.user_scenario.remote: + common_env.update( + # Podman 4.3.0 has a regression which requires a port for remote connections to work. + # See: https://github.com/containers/podman/issues/16509 + CONTAINER_HOST=f'ssh://{scenario.user_scenario.remote.name}@localhost:22' + f'/run/user/{scenario.user_scenario.remote.pwnam.pw_uid}/podman/podman.sock', + CONTAINER_SSHKEY=str(pathlib.Path('~/.ssh/id_rsa').expanduser()), # TODO: add support for ssh + remote when the ssh user is not root + ) + + test_env.update(ANSIBLE_TEST_PREFER_PODMAN='1') + + test_env.update(common_env) + + if scenario.user_scenario.ssh: + client_become_cmd = ['ssh', f'{scenario.user_scenario.ssh.name}@localhost'] + test_commands = [client_become_cmd + [f'cd ~/ansible; {format_env(test_env)}{sys.executable} bin/{shlex.join(command)}'] for command in commands] + else: + client_become_cmd = ['sh', '-c'] + test_commands = [client_become_cmd + [f'{format_env(test_env)}{shlex.join(command)}'] for command in commands] + + prime_storage_command = [] + + if scenario.engine == 'podman' and scenario.user_scenario.actual.name == UNPRIVILEGED_USER_NAME: + # When testing podman we need to make sure that the overlay filesystem is used instead of vfs. + # Using the vfs filesystem will result in running out of disk space during the tests. + # To change the filesystem used, the existing storage directory must be removed before "priming" the storage database. + # + # Without this change the following message may be displayed: + # + # User-selected graph driver "overlay" overwritten by graph driver "vfs" from database - delete libpod local files to resolve + # + # However, with this change it may be replaced with the following message: + # + # User-selected graph driver "vfs" overwritten by graph driver "overlay" from database - delete libpod local files to resolve + + actual_become_cmd = ['ssh', f'{scenario.user_scenario.actual.name}@localhost'] + prime_storage_command = actual_become_cmd + prepare_prime_podman_storage() + + message = '' + + if scenario.expose_cgroup_v1: + prepare_cgroup_systemd(scenario.user_scenario.actual.name, scenario.engine) + + try: + if prime_storage_command: + retry_command(lambda: run_command(*prime_storage_command)) + + if scenario.disable_selinux: + run_command('setenforce', 'permissive') + + for test_command in test_commands: + retry_command(lambda: run_command(*test_command)) + except SubprocessError as ex: + message = str(ex) + display.error(f'{scenario} {message}') + finally: + if scenario.disable_selinux: + run_command('setenforce', 'enforcing') + + if scenario.expose_cgroup_v1: + dirs = remove_cgroup_systemd() + else: + dirs = list_group_systemd() + + cleanup_command = [scenario.engine, 'rmi', '-f', scenario.image] + + try: + retry_command(lambda: run_command(*client_become_cmd + [f'{format_env(common_env)}{shlex.join(cleanup_command)}'])) + except SubprocessError as ex: + display.error(str(ex)) + + cleanup = cleanup_podman() if scenario.engine == 'podman' else tuple() + + finish = time.monotonic() + duration = datetime.timedelta(seconds=int(finish - start)) + + display.section(f'Testing {scenario} Completed in {duration}') + + return TestResult( + scenario=scenario, + message=message, + cleanup=cleanup, + duration=duration, + cgroup_dirs=tuple(str(path) for path in dirs), + ) + + +def prepare_prime_podman_storage() -> list[str]: + """Partially prime podman storage and return a command to complete the remainder.""" + prime_storage_command = ['rm -rf ~/.local/share/containers; STORAGE_DRIVER=overlay podman pull quay.io/bedrock/alpine:3.16.2'] + + test_containers = pathlib.Path(f'~{UNPRIVILEGED_USER_NAME}/.local/share/containers').expanduser() + + if test_containers.is_dir(): + # First remove the directory as root, since the user may not have permissions on all the files. + # The directory will be removed again after login, before initializing the database. + rmtree(test_containers) + + return prime_storage_command + + +def cleanup_podman() -> tuple[str, ...]: + """Cleanup podman processes and files on disk.""" + cleanup = [] + + for remaining in range(3, -1, -1): + processes = [(int(item[0]), item[1]) for item in + [item.split(maxsplit=1) for item in run_command('ps', '-A', '-o', 'pid,comm', capture=True).stdout.splitlines()] + if pathlib.Path(item[1].split()[0]).name in ('catatonit', 'podman', 'conmon')] + + if not processes: + break + + for pid, name in processes: + display.info(f'Killing "{name}" ({pid}) ...') + + try: + os.kill(pid, signal.SIGTERM if remaining > 1 else signal.SIGKILL) + except ProcessLookupError: + pass + + cleanup.append(name) + + time.sleep(1) + else: + raise Exception('failed to kill all matching processes') + + uid = pwd.getpwnam(UNPRIVILEGED_USER_NAME).pw_uid + + container_tmp = pathlib.Path(f'/tmp/containers-user-{uid}') + podman_tmp = pathlib.Path(f'/tmp/podman-run-{uid}') + + user_config = pathlib.Path(f'~{UNPRIVILEGED_USER_NAME}/.config').expanduser() + user_local = pathlib.Path(f'~{UNPRIVILEGED_USER_NAME}/.local').expanduser() + + if container_tmp.is_dir(): + rmtree(container_tmp) + + if podman_tmp.is_dir(): + rmtree(podman_tmp) + + if user_config.is_dir(): + rmtree(user_config) + + if user_local.is_dir(): + rmtree(user_local) + + return tuple(sorted(set(cleanup))) + + +def have_cgroup_systemd() -> bool: + """Return True if the container host has a systemd cgroup.""" + return pathlib.Path(CGROUP_SYSTEMD).is_dir() + + +def prepare_cgroup_systemd(username: str, engine: str) -> None: + """Prepare the systemd cgroup.""" + CGROUP_SYSTEMD.mkdir() + + run_command('mount', 'cgroup', '-t', 'cgroup', str(CGROUP_SYSTEMD), '-o', 'none,name=systemd,xattr', capture=True) + + if engine == 'podman': + run_command('chown', '-R', f'{username}:{username}', str(CGROUP_SYSTEMD)) + + run_command('find', str(CGROUP_SYSTEMD), '-type', 'd', '-exec', 'ls', '-l', '{}', ';') + + +def list_group_systemd() -> list[pathlib.Path]: + """List the systemd cgroup.""" + dirs = set() + + for dirpath, dirnames, filenames in os.walk(CGROUP_SYSTEMD, topdown=False): + for dirname in dirnames: + target_path = pathlib.Path(dirpath, dirname) + display.info(f'dir: {target_path}') + dirs.add(target_path) + + return sorted(dirs) + + +def remove_cgroup_systemd() -> list[pathlib.Path]: + """Remove the systemd cgroup.""" + dirs = set() + + for sleep_seconds in range(1, 10): + try: + for dirpath, dirnames, filenames in os.walk(CGROUP_SYSTEMD, topdown=False): + for dirname in dirnames: + target_path = pathlib.Path(dirpath, dirname) + display.info(f'rmdir: {target_path}') + dirs.add(target_path) + target_path.rmdir() + except OSError as ex: + if ex.errno != errno.EBUSY: + raise + + error = str(ex) + else: + break + + display.warning(f'{error} -- sleeping for {sleep_seconds} second(s) before trying again ...') # pylint: disable=used-before-assignment + + time.sleep(sleep_seconds) + + time.sleep(1) # allow time for cgroups to be fully removed before unmounting + + run_command('umount', str(CGROUP_SYSTEMD)) + + CGROUP_SYSTEMD.rmdir() + + time.sleep(1) # allow time for cgroup hierarchy to be removed after unmounting + + cgroup = pathlib.Path('/proc/self/cgroup').read_text() + + if 'systemd' in cgroup: + raise Exception('systemd hierarchy detected') + + return sorted(dirs) + + +def rmtree(path: pathlib.Path) -> None: + """Wrapper around shutil.rmtree with additional error handling.""" + for retries in range(10, -1, -1): + try: + display.info(f'rmtree: {path} ({retries} attempts remaining) ... ') + shutil.rmtree(path) + except Exception: + if not path.exists(): + display.info(f'rmtree: {path} (not found)') + return + + if not path.is_dir(): + display.info(f'rmtree: {path} (not a directory)') + return + + if retries: + continue + + raise + else: + display.info(f'rmtree: {path} (done)') + return + + +def format_env(env: dict[str, str]) -> str: + """Format an env dict for injection into a shell command and return the resulting string.""" + if env: + return ' '.join(f'{shlex.quote(key)}={shlex.quote(value)}' for key, value in env.items()) + ' ' + + return '' + + +class DockerInfo: + """The results of `docker info` for the container runtime.""" + + def __init__(self, data: dict[str, t.Any]) -> None: + self.data = data + + @property + def cgroup_version(self) -> int: + """The cgroup version of the container host.""" + data = self.data + host = data.get('host') + + if host: + version = int(host['cgroupVersion'].lstrip('v')) # podman + else: + version = int(data['CgroupVersion']) # docker + + return version + + +@functools.lru_cache +def get_docker_info(engine: str) -> DockerInfo: + """Return info for the current container runtime. The results are cached.""" + return DockerInfo(json.loads(run_command(engine, 'info', '--format', '{{ json . }}', capture=True).stdout)) + + +@dataclasses.dataclass(frozen=True) +class User: + name: str + pwnam: pwd.struct_passwd + + @classmethod + def get(cls, name: str) -> User: + return User( + name=name, + pwnam=pwd.getpwnam(name), + ) + + +@dataclasses.dataclass(frozen=True) +class UserScenario: + ssh: User = None + remote: User = None + + @property + def actual(self) -> User: + return self.remote or self.ssh or ROOT_USER + + +@dataclasses.dataclass(frozen=True) +class TestScenario: + user_scenario: UserScenario + engine: str + container_name: str + image: str + disable_selinux: bool + expose_cgroup_v1: bool + + @property + def tags(self) -> tuple[str, ...]: + tags = [] + + if self.user_scenario.ssh: + tags.append(f'ssh: {self.user_scenario.ssh.name}') + + if self.user_scenario.remote: + tags.append(f'remote: {self.user_scenario.remote.name}') + + if self.disable_selinux: + tags.append('selinux: permissive') + + if self.expose_cgroup_v1: + tags.append('cgroup: v1') + + return tuple(tags) + + @property + def tag_label(self) -> str: + return ' '.join(f'[{tag}]' for tag in self.tags) + + def __str__(self): + return f'[{self.container_name}] ({self.engine}) {self.tag_label}'.strip() + + +@dataclasses.dataclass(frozen=True) +class TestResult: + scenario: TestScenario + message: str + cleanup: tuple[str, ...] + duration: datetime.timedelta + cgroup_dirs: tuple[str, ...] + + +def parse_completion_entry(value: str) -> tuple[str, dict[str, str]]: + """Parse the given completion entry, returning the entry name and a dictionary of key/value settings.""" + values = value.split() + + name = values[0] + data = {kvp[0]: kvp[1] if len(kvp) > 1 else '' for kvp in [item.split('=', 1) for item in values[1:]]} + + return name, data + + +@dataclasses.dataclass(frozen=True) +class SubprocessResult: + """Result from execution of a subprocess.""" + + command: list[str] + stdout: str + stderr: str + status: int + + +class ApplicationError(Exception): + """An application error.""" + + def __init__(self, message: str) -> None: + self.message = message + + super().__init__(message) + + +class SubprocessError(ApplicationError): + """An error from executing a subprocess.""" + + def __init__(self, result: SubprocessResult) -> None: + self.result = result + + message = f'Command `{shlex.join(result.command)}` exited with status: {result.status}' + + stdout = (result.stdout or '').strip() + stderr = (result.stderr or '').strip() + + if stdout: + message += f'\n>>> Standard Output\n{stdout}' + + if stderr: + message += f'\n>>> Standard Error\n{stderr}' + + super().__init__(message) + + +class ProgramNotFoundError(ApplicationError): + """A required program was not found.""" + + def __init__(self, name: str) -> None: + self.name = name + + super().__init__(f'Missing program: {name}') + + +class Display: + """Display interface for sending output to the console.""" + + CLEAR = '\033[0m' + RED = '\033[31m' + GREEN = '\033[32m' + YELLOW = '\033[33m' + BLUE = '\033[34m' + PURPLE = '\033[35m' + CYAN = '\033[36m' + + def __init__(self) -> None: + self.sensitive: set[str] = set() + + def section(self, message: str) -> None: + """Print a section message to the console.""" + self.show(f'==> {message}', color=self.BLUE) + + def subsection(self, message: str) -> None: + """Print a subsection message to the console.""" + self.show(f'--> {message}', color=self.CYAN) + + def fatal(self, message: str) -> None: + """Print a fatal message to the console.""" + self.show(f'FATAL: {message}', color=self.RED) + + def error(self, message: str) -> None: + """Print an error message to the console.""" + self.show(f'ERROR: {message}', color=self.RED) + + def warning(self, message: str) -> None: + """Print a warning message to the console.""" + self.show(f'WARNING: {message}', color=self.PURPLE) + + def info(self, message: str) -> None: + """Print an info message to the console.""" + self.show(f'INFO: {message}', color=self.YELLOW) + + def show(self, message: str, color: str | None = None) -> None: + """Print a message to the console.""" + for item in self.sensitive: + message = message.replace(item, '*' * len(item)) + + print(f'{color or self.CLEAR}{message}{self.CLEAR}', flush=True) + + +def run_module( + module: str, + args: dict[str, t.Any], +) -> SubprocessResult: + """Run the specified Ansible module and return the result.""" + playbook = f''' +- hosts: localhost + gather_facts: no + tasks: + - user: {json.dumps(args)} +''' + + with tempfile.NamedTemporaryFile() as playbook_file: + playbook_file.write(playbook.encode('utf8')) + playbook_file.flush() + + return run_command('ansible-playbook', '-v', playbook_file.name) + + +def retry_command(func: t.Callable[[], SubprocessResult], attempts: int = 3) -> SubprocessResult: + """Run the given command function up to the specified number of attempts when the failure is due to an SSH error.""" + for attempts_remaining in range(attempts - 1, -1, -1): + try: + return func() + except SubprocessError as ex: + if ex.result.command[0] == 'ssh' and ex.result.status == 255 and attempts_remaining: + # SSH connections on our Ubuntu 22.04 host sometimes fail for unknown reasons. + # This retry should allow the test suite to continue, maintaining CI stability. + # TODO: Figure out why local SSH connections sometimes fail during the test run. + display.warning('Command failed due to an SSH error. Waiting a few seconds before retrying.') + time.sleep(3) + continue + + raise + + +def run_command( + *command: str, + data: str | None = None, + stdin: int | t.IO[bytes] | None = None, + env: dict[str, str] | None = None, + capture: bool = False, +) -> SubprocessResult: + """Run the specified command and return the result.""" + stdin = subprocess.PIPE if data else stdin or subprocess.DEVNULL + stdout = subprocess.PIPE if capture else None + stderr = subprocess.PIPE if capture else None + + display.subsection(f'Run command: {shlex.join(command)}') + + try: + with subprocess.Popen(args=command, stdin=stdin, stdout=stdout, stderr=stderr, env=env, text=True) as process: + process_stdout, process_stderr = process.communicate(data) + process_status = process.returncode + except FileNotFoundError: + raise ProgramNotFoundError(command[0]) from None + + result = SubprocessResult( + command=list(command), + stdout=process_stdout, + stderr=process_stderr, + status=process_status, + ) + + if process.returncode != 0: + raise SubprocessError(result) + + return result + + +class Bootstrapper(metaclass=abc.ABCMeta): + """Bootstrapper for remote instances.""" + + @classmethod + def install_podman(cls) -> bool: + """Return True if podman will be installed.""" + return False + + @classmethod + def install_docker(cls) -> bool: + """Return True if docker will be installed.""" + return False + + @classmethod + def usable(cls) -> bool: + """Return True if the bootstrapper can be used, otherwise False.""" + return False + + @classmethod + def init(cls) -> t.Type[Bootstrapper]: + """Return a bootstrapper type appropriate for the current system.""" + for bootstrapper in cls.__subclasses__(): + if bootstrapper.usable(): + return bootstrapper + + display.warning('No supported bootstrapper found.') + return Bootstrapper + + @classmethod + def run(cls) -> None: + """Run the bootstrapper.""" + cls.configure_root_user() + cls.configure_unprivileged_user() + cls.configure_source_trees() + cls.configure_ssh_keys() + cls.configure_podman_remote() + + @classmethod + def configure_root_user(cls) -> None: + """Configure the root user to run tests.""" + root_password_status = run_command('passwd', '--status', 'root', capture=True) + root_password_set = root_password_status.stdout.split()[1] + + if root_password_set not in ('P', 'PS'): + root_password = run_command('openssl', 'passwd', '-5', '-stdin', data=secrets.token_hex(8), capture=True).stdout.strip() + + run_module( + 'user', + dict( + user='root', + password=root_password, + ), + ) + + @classmethod + def configure_unprivileged_user(cls) -> None: + """Configure the unprivileged user to run tests.""" + unprivileged_password = run_command('openssl', 'passwd', '-5', '-stdin', data=secrets.token_hex(8), capture=True).stdout.strip() + + run_module( + 'user', + dict( + user=UNPRIVILEGED_USER_NAME, + password=unprivileged_password, + groups=['docker'] if cls.install_docker() else [], + append=True, + ), + ) + + if os_release.id == 'alpine': + # Most distros handle this automatically, but not Alpine. + # See: https://www.redhat.com/sysadmin/rootless-podman + start = 165535 + end = start + 65535 + id_range = f'{start}-{end}' + + run_command( + 'usermod', + '--add-subuids', + id_range, + '--add-subgids', + id_range, + UNPRIVILEGED_USER_NAME, + ) + + @classmethod + def configure_source_trees(cls): + """Configure the source trees needed to run tests for both root and the unprivileged user.""" + current_ansible = pathlib.Path(os.environ['PYTHONPATH']).parent + + root_ansible = pathlib.Path('~').expanduser() / 'ansible' + test_ansible = pathlib.Path(f'~{UNPRIVILEGED_USER_NAME}').expanduser() / 'ansible' + + if current_ansible != root_ansible: + display.info(f'copying {current_ansible} -> {root_ansible} ...') + rmtree(root_ansible) + shutil.copytree(current_ansible, root_ansible) + run_command('chown', '-R', 'root:root', str(root_ansible)) + + display.info(f'copying {current_ansible} -> {test_ansible} ...') + rmtree(test_ansible) + shutil.copytree(current_ansible, test_ansible) + run_command('chown', '-R', f'{UNPRIVILEGED_USER_NAME}:{UNPRIVILEGED_USER_NAME}', str(test_ansible)) + + paths = [pathlib.Path(test_ansible)] + + for root, dir_names, file_names in os.walk(test_ansible): + paths.extend(pathlib.Path(root, dir_name) for dir_name in dir_names) + paths.extend(pathlib.Path(root, file_name) for file_name in file_names) + + user = pwd.getpwnam(UNPRIVILEGED_USER_NAME) + uid = user.pw_uid + gid = user.pw_gid + + for path in paths: + os.chown(path, uid, gid) + + @classmethod + def configure_ssh_keys(cls) -> None: + """Configure SSH keys needed to run tests.""" + user = pwd.getpwnam(UNPRIVILEGED_USER_NAME) + uid = user.pw_uid + gid = user.pw_gid + + current_rsa_pub = pathlib.Path('~/.ssh/id_rsa.pub').expanduser() + + test_authorized_keys = pathlib.Path(f'~{UNPRIVILEGED_USER_NAME}/.ssh/authorized_keys').expanduser() + + test_authorized_keys.parent.mkdir(mode=0o755, parents=True, exist_ok=True) + os.chown(test_authorized_keys.parent, uid, gid) + + shutil.copyfile(current_rsa_pub, test_authorized_keys) + os.chown(test_authorized_keys, uid, gid) + test_authorized_keys.chmod(mode=0o644) + + @classmethod + def configure_podman_remote(cls) -> None: + """Configure podman remote support.""" + # TODO: figure out how to support remote podman without systemd (Alpine) + # TODO: figure out how to support remote podman on Ubuntu + if os_release.id in ('alpine', 'ubuntu'): + return + + # Support podman remote on any host with systemd available. + retry_command(lambda: run_command('ssh', f'{UNPRIVILEGED_USER_NAME}@localhost', 'systemctl', '--user', 'enable', '--now', 'podman.socket')) + run_command('loginctl', 'enable-linger', UNPRIVILEGED_USER_NAME) + + +class DnfBootstrapper(Bootstrapper): + """Bootstrapper for dnf based systems.""" + + @classmethod + def install_podman(cls) -> bool: + """Return True if podman will be installed.""" + return True + + @classmethod + def install_docker(cls) -> bool: + """Return True if docker will be installed.""" + return os_release.id != 'rhel' + + @classmethod + def usable(cls) -> bool: + """Return True if the bootstrapper can be used, otherwise False.""" + return bool(shutil.which('dnf')) + + @classmethod + def run(cls) -> None: + """Run the bootstrapper.""" + # NOTE: Install crun to make it available to podman, otherwise installing moby-engine can cause podman to use runc instead. + packages = ['podman', 'crun'] + + if cls.install_docker(): + packages.append('moby-engine') + + if os_release.id == 'fedora' and os_release.version_id == '36': + # In Fedora 36 the current version of netavark, 1.2.0, causes TCP connect to hang between rootfull containers. + # The previously tested version, 1.1.0, did not have this issue. + # Unfortunately, with the release of 1.2.0 the 1.1.0 package was removed from the repositories. + # Thankfully the 1.0.2 version is available and also works, so we'll use that here until a fixed version is available. + # See: https://github.com/containers/netavark/issues/491 + packages.append('netavark-1.0.2') + + if os_release.id == 'rhel': + # As of the release of RHEL 9.1, installing podman on RHEL 9.0 results in a non-fatal error at install time: + # + # libsemanage.semanage_pipe_data: Child process /usr/libexec/selinux/hll/pp failed with code: 255. (No such file or directory). + # container: libsepol.policydb_read: policydb module version 21 does not match my version range 4-20 + # container: libsepol.sepol_module_package_read: invalid module in module package (at section 0) + # container: Failed to read policy package + # libsemanage.semanage_direct_commit: Failed to compile hll files into cil files. + # (No such file or directory). + # /usr/sbin/semodule: Failed! + # + # Unfortunately this is then fatal when running podman, resulting in no error message and a 127 return code. + # The solution is to update the policycoreutils package *before* installing podman. + # + # NOTE: This work-around can probably be removed once we're testing on RHEL 9.1, as the updated packages should already be installed. + # Unfortunately at this time there is no RHEL 9.1 AMI available (other than the Beta release). + + run_command('dnf', 'update', '-y', 'policycoreutils') + + run_command('dnf', 'install', '-y', *packages) + + if cls.install_docker(): + run_command('systemctl', 'start', 'docker') + + if os_release.id == 'rhel' and os_release.version_id.startswith('8.'): + # RHEL 8 defaults to using runc instead of crun. + # Unfortunately runc seems to have issues with podman remote. + # Specifically, it tends to cause conmon to burn CPU until it reaches the specified exit delay. + # So we'll just change the system default to crun instead. + # Unfortunately we can't do this with the `--runtime` option since that doesn't work with podman remote. + + conf = pathlib.Path('/usr/share/containers/containers.conf').read_text() + + conf = re.sub('^runtime .*', 'runtime = "crun"', conf, flags=re.MULTILINE) + + pathlib.Path('/etc/containers/containers.conf').write_text(conf) + + super().run() + + +class AptBootstrapper(Bootstrapper): + """Bootstrapper for apt based systems.""" + + @classmethod + def install_podman(cls) -> bool: + """Return True if podman will be installed.""" + return not (os_release.id == 'ubuntu' and os_release.version_id == '20.04') + + @classmethod + def install_docker(cls) -> bool: + """Return True if docker will be installed.""" + return True + + @classmethod + def usable(cls) -> bool: + """Return True if the bootstrapper can be used, otherwise False.""" + return bool(shutil.which('apt-get')) + + @classmethod + def run(cls) -> None: + """Run the bootstrapper.""" + apt_env = os.environ.copy() + apt_env.update( + DEBIAN_FRONTEND='noninteractive', + ) + + packages = ['docker.io'] + + if cls.install_podman(): + # NOTE: Install crun to make it available to podman, otherwise installing docker.io can cause podman to use runc instead. + # Using podman rootless requires the `newuidmap` and `slirp4netns` commands. + packages.extend(('podman', 'crun', 'uidmap', 'slirp4netns')) + + run_command('apt-get', 'install', *packages, '-y', '--no-install-recommends', env=apt_env) + + super().run() + + +class ApkBootstrapper(Bootstrapper): + """Bootstrapper for apk based systems.""" + + @classmethod + def install_podman(cls) -> bool: + """Return True if podman will be installed.""" + return True + + @classmethod + def install_docker(cls) -> bool: + """Return True if docker will be installed.""" + return True + + @classmethod + def usable(cls) -> bool: + """Return True if the bootstrapper can be used, otherwise False.""" + return bool(shutil.which('apk')) + + @classmethod + def run(cls) -> None: + """Run the bootstrapper.""" + # The `openssl` package is used to generate hashed passwords. + packages = ['docker', 'podman', 'openssl'] + + run_command('apk', 'add', *packages) + run_command('service', 'docker', 'start') + run_command('modprobe', 'tun') + + super().run() + + +@dataclasses.dataclass(frozen=True) +class OsRelease: + """Operating system identification.""" + + id: str + version_id: str + + @staticmethod + def init() -> OsRelease: + """Detect the current OS release and return the result.""" + lines = run_command('sh', '-c', '. /etc/os-release && echo $ID && echo $VERSION_ID', capture=True).stdout.splitlines() + + result = OsRelease( + id=lines[0], + version_id=lines[1], + ) + + display.show(f'Detected OS "{result.id}" version "{result.version_id}".') + + return result + + +display = Display() +os_release = OsRelease.init() + +ROOT_USER = User.get('root') + +if __name__ == '__main__': + main() diff --git a/test/integration/targets/ansible-test-container/runme.sh b/test/integration/targets/ansible-test-container/runme.sh new file mode 100755 index 0000000000..56fd669031 --- /dev/null +++ b/test/integration/targets/ansible-test-container/runme.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +set -eu + +./runme.py diff --git a/test/lib/ansible_test/_data/completion/docker.txt b/test/lib/ansible_test/_data/completion/docker.txt index 93a9dea237..e1bdda86f5 100644 --- a/test/lib/ansible_test/_data/completion/docker.txt +++ b/test/lib/ansible_test/_data/completion/docker.txt @@ -1,11 +1,11 @@ -base image=quay.io/ansible/base-test-container:2.2.0 python=3.10,2.7,3.5,3.6,3.7,3.8,3.9 seccomp=unconfined -default image=quay.io/ansible/default-test-container:5.9.0 python=3.10,2.7,3.5,3.6,3.7,3.8,3.9 seccomp=unconfined context=collection -default image=quay.io/ansible/ansible-core-test-container:5.9.0 python=3.10,2.7,3.5,3.6,3.7,3.8,3.9 seccomp=unconfined context=ansible-core -alpine3 image=quay.io/ansible/alpine3-test-container:3.3.0 python=3.9 -centos7 image=quay.io/ansible/centos7-test-container:3.1.0 python=2.7 seccomp=unconfined -fedora34 image=quay.io/ansible/fedora34-test-container:3.1.0 python=3.9 seccomp=unconfined -fedora35 image=quay.io/ansible/fedora35-test-container:3.2.0 python=3.10 seccomp=unconfined -opensuse15py2 image=quay.io/ansible/opensuse15py2-test-container:3.1.0 python=2.7 -opensuse15 image=quay.io/ansible/opensuse15-test-container:3.1.0 python=3.6 -ubuntu1804 image=quay.io/ansible/ubuntu1804-test-container:3.1.0 python=3.6 seccomp=unconfined -ubuntu2004 image=quay.io/ansible/ubuntu2004-test-container:3.1.0 python=3.8 seccomp=unconfined +base image=quay.io/ansible/base-test-container:2.2.1 python=3.10,2.7,3.5,3.6,3.7,3.8,3.9 +default image=quay.io/ansible/default-test-container:5.9.1 python=3.10,2.7,3.5,3.6,3.7,3.8,3.9 context=collection +default image=quay.io/ansible/ansible-core-test-container:5.9.1 python=3.10,2.7,3.5,3.6,3.7,3.8,3.9 context=ansible-core +alpine3 image=quay.io/ansible/alpine3-test-container:3.3.1 python=3.9 cgroup=none audit=none +centos7 image=quay.io/ansible/centos7-test-container:3.1.1 python=2.7 cgroup=v1-only +fedora34 image=quay.io/ansible/fedora34-test-container:3.1.1 python=3.9 +fedora35 image=quay.io/ansible/fedora35-test-container:3.2.1 python=3.10 +opensuse15py2 image=quay.io/ansible/opensuse15py2-test-container:3.1.1 python=2.7 +opensuse15 image=quay.io/ansible/opensuse15-test-container:3.1.1 python=3.6 +ubuntu1804 image=quay.io/ansible/ubuntu1804-test-container:3.1.1 python=3.6 +ubuntu2004 image=quay.io/ansible/ubuntu2004-test-container:3.1.1 python=3.8 diff --git a/test/lib/ansible_test/_data/completion/remote.txt b/test/lib/ansible_test/_data/completion/remote.txt index c07f2d1dbb..bd0a319b70 100644 --- a/test/lib/ansible_test/_data/completion/remote.txt +++ b/test/lib/ansible_test/_data/completion/remote.txt @@ -1,11 +1,16 @@ -freebsd/12.3 python=3.8 python_dir=/usr/local/bin provider=aws arch=x86_64 -freebsd/13.0 python=3.7,2.7,3.8,3.9 python_dir=/usr/local/bin provider=aws arch=x86_64 -freebsd python_dir=/usr/local/bin provider=aws arch=x86_64 -macos/12.0 python=3.10 python_dir=/usr/local/bin provider=parallels arch=x86_64 -macos python_dir=/usr/local/bin provider=parallels arch=x86_64 -rhel/7.9 python=2.7 provider=aws arch=x86_64 -rhel/8.5 python=3.6,3.8,3.9 provider=aws arch=x86_64 -rhel/9.0 python=3.9 provider=aws arch=x86_64 -rhel provider=aws arch=x86_64 -ubuntu/22.04 python=3.10 provider=aws arch=x86_64 -ubuntu provider=aws arch=x86_64 +alpine/3.16 python=3.10 become=doas_sudo provider=aws arch=x86_64 +alpine become=doas_sudo provider=aws arch=x86_64 +fedora/36 python=3.10 become=sudo provider=aws arch=x86_64 +fedora become=sudo provider=aws arch=x86_64 +freebsd/12.3 python=3.8 python_dir=/usr/local/bin become=su_sudo provider=aws arch=x86_64 +freebsd/13.0 python=3.7,2.7,3.8,3.9 python_dir=/usr/local/bin become=su_sudo provider=aws arch=x86_64 +freebsd python_dir=/usr/local/bin become=su_sudo provider=aws arch=x86_64 +macos/12.0 python=3.10 python_dir=/usr/local/bin become=sudo provider=parallels arch=x86_64 +macos python_dir=/usr/local/bin become=sudo provider=parallels arch=x86_64 +rhel/7.9 python=2.7 become=sudo provider=aws arch=x86_64 +rhel/8.5 python=3.6,3.8,3.9 become=sudo provider=aws arch=x86_64 +rhel/9.0 python=3.9 become=sudo provider=aws arch=x86_64 +rhel become=sudo provider=aws arch=x86_64 +ubuntu/20.04 python=3.8,3.9 become=sudo provider=aws arch=x86_64 +ubuntu/22.04 python=3.10 become=sudo provider=aws arch=x86_64 +ubuntu become=sudo provider=aws arch=x86_64 diff --git a/test/lib/ansible_test/_internal/__init__.py b/test/lib/ansible_test/_internal/__init__.py index 43d10c027d..9b08833e97 100644 --- a/test/lib/ansible_test/_internal/__init__.py +++ b/test/lib/ansible_test/_internal/__init__.py @@ -11,8 +11,13 @@ from .init import ( CURRENT_RLIMIT_NOFILE, ) +from .constants import ( + STATUS_HOST_CONNECTION_ERROR, +) + from .util import ( ApplicationError, + HostConnectionError, display, ) @@ -88,6 +93,10 @@ def main(cli_args=None): # type: (t.Optional[t.List[str]]) -> None display.review_warnings() config.success = True + except HostConnectionError as ex: + display.fatal(str(ex)) + ex.run_callback() + sys.exit(STATUS_HOST_CONNECTION_ERROR) except ApplicationWarning as ex: display.warning(u'%s' % ex) sys.exit(0) diff --git a/test/lib/ansible_test/_internal/ansible_util.py b/test/lib/ansible_test/_internal/ansible_util.py index 0fe3db26ae..8ef06bfaa1 100644 --- a/test/lib/ansible_test/_internal/ansible_util.py +++ b/test/lib/ansible_test/_internal/ansible_util.py @@ -51,6 +51,10 @@ from .host_configs import ( PythonConfig, ) +from .thread import ( + mutex, +) + def parse_inventory(args, inventory_path): # type: (EnvironmentConfig, str) -> t.Dict[str, t.Any] """Return a dict parsed from the given inventory file.""" @@ -192,6 +196,7 @@ def configure_plugin_paths(args): # type: (CommonConfig) -> t.Dict[str, str] return env +@mutex def get_ansible_python_path(args): # type: (CommonConfig) -> str """ Return a directory usable for PYTHONPATH, containing only the ansible package. diff --git a/test/lib/ansible_test/_internal/become.py b/test/lib/ansible_test/_internal/become.py index dc0a208a62..5a5506a14e 100644 --- a/test/lib/ansible_test/_internal/become.py +++ b/test/lib/ansible_test/_internal/become.py @@ -5,9 +5,18 @@ import abc import shlex import typing as t +from .util import ( + get_subclasses, +) + class Become(metaclass=abc.ABCMeta): """Base class for become implementations.""" + @classmethod + def name(cls): + """The name of this plugin.""" + return cls.__name__.lower() + @property @abc.abstractmethod def method(self): # type: () -> str @@ -18,6 +27,38 @@ class Become(metaclass=abc.ABCMeta): """Return the given command, if any, with privilege escalation.""" +class Doas(Become): + """Become using 'doas'.""" + @property + def method(self): # type: () -> str + """The name of the Ansible become plugin that is equivalent to this.""" + raise NotImplementedError('Ansible has no built-in doas become plugin.') + + def prepare_command(self, command): # type: (t.List[str]) -> t.List[str] + """Return the given command, if any, with privilege escalation.""" + become = ['doas', '-n'] + + if command: + become.extend(['sh', '-c', ' '.join(shlex.quote(c) for c in command)]) + else: + become.extend(['-s']) + + return become + + +class DoasSudo(Doas): + """Become using 'doas' in ansible-test and then after bootstrapping use 'sudo' for other ansible commands.""" + @classmethod + def name(cls): + """The name of this plugin.""" + return 'doas_sudo' + + @property + def method(self): # type: () -> str + """The name of the Ansible become plugin that is equivalent to this.""" + return 'sudo' + + class Su(Become): """Become using 'su'.""" @property @@ -35,6 +76,19 @@ class Su(Become): return become +class SuSudo(Su): + """Become using 'su' in ansible-test and then after bootstrapping use 'sudo' for other ansible commands.""" + @classmethod + def name(cls): + """The name of this plugin.""" + return 'su_sudo' + + @property + def method(self): # type: () -> str + """The name of the Ansible become plugin that is equivalent to this.""" + return 'sudo' + + class Sudo(Become): """Become using 'sudo'.""" @property @@ -50,3 +104,6 @@ class Sudo(Become): become.extend(['sh', '-c', ' '.join(shlex.quote(c) for c in command)]) return become + + +SUPPORTED_BECOME_METHODS = {cls.name(): cls for cls in get_subclasses(Become)} diff --git a/test/lib/ansible_test/_internal/cgroup.py b/test/lib/ansible_test/_internal/cgroup.py new file mode 100644 index 0000000000..52779599fc --- /dev/null +++ b/test/lib/ansible_test/_internal/cgroup.py @@ -0,0 +1,110 @@ +"""Linux control group constants, classes and utilities.""" +from __future__ import annotations + +import codecs +import dataclasses +import pathlib +import re + + +class CGroupPath: + """Linux cgroup path constants.""" + ROOT = '/sys/fs/cgroup' + SYSTEMD = '/sys/fs/cgroup/systemd' + SYSTEMD_RELEASE_AGENT = '/sys/fs/cgroup/systemd/release_agent' + + +class MountType: + """Linux filesystem mount type constants.""" + TMPFS = 'tmpfs' + CGROUP_V1 = 'cgroup' + CGROUP_V2 = 'cgroup2' + + +@dataclasses.dataclass(frozen=True) +class CGroupEntry: + """A single cgroup entry parsed from '/proc/{pid}/cgroup' in the proc filesystem.""" + id: int + subsystem: str + path: pathlib.PurePosixPath + + @property + def root_path(self): + """The root path for this cgroup subsystem.""" + return pathlib.PurePosixPath(CGroupPath.ROOT, self.subsystem) + + @property + def full_path(self) -> pathlib.PurePosixPath: + """The full path for this cgroup subsystem.""" + return pathlib.PurePosixPath(self.root_path, str(self.path).lstrip('/')) + + @classmethod + def parse(cls, value: str) -> CGroupEntry: + """Parse the given cgroup line from the proc filesystem and return a cgroup entry.""" + cid, subsystem, path = value.split(':') + + return cls( + id=int(cid), + subsystem=re.sub('^name=', '', subsystem), + path=pathlib.PurePosixPath(path) + ) + + @classmethod + def loads(cls, value: str) -> tuple[CGroupEntry, ...]: + """Parse the given output from the proc filesystem and return a tuple of cgroup entries.""" + return tuple(cls.parse(line) for line in value.splitlines()) + + +@dataclasses.dataclass(frozen=True) +class MountEntry: + """A single mount info entry parsed from '/proc/{pid}/mountinfo' in the proc filesystem.""" + mount_id: int + parent_id: int + device_major: int + device_minor: int + root: pathlib.PurePosixPath + path: pathlib.PurePosixPath + options: tuple[str, ...] + fields: tuple[str, ...] + type: str + source: pathlib.PurePosixPath + super_options: tuple[str, ...] + + @classmethod + def parse(cls, value: str) -> MountEntry: + """Parse the given mount info line from the proc filesystem and return a mount entry.""" + # See: https://man7.org/linux/man-pages/man5/proc.5.html + # See: https://github.com/torvalds/linux/blob/aea23e7c464bfdec04b52cf61edb62030e9e0d0a/fs/proc_namespace.c#L135 + mount_id, parent_id, device_major_minor, root, path, options, *remainder = value.split(' ') + fields = remainder[:-4] + separator, mtype, source, super_options = remainder[-4:] + + assert separator == '-' + + device_major, device_minor = device_major_minor.split(':') + + return cls( + mount_id=int(mount_id), + parent_id=int(parent_id), + device_major=int(device_major), + device_minor=int(device_minor), + root=_decode_path(root), + path=_decode_path(path), + options=tuple(options.split(',')), + fields=tuple(fields), + type=mtype, + source=_decode_path(source), + super_options=tuple(super_options.split(',')), + ) + + @classmethod + def loads(cls, value: str) -> tuple[MountEntry, ...]: + """Parse the given output from the proc filesystem and return a tuple of mount info entries.""" + return tuple(cls.parse(line) for line in value.splitlines()) + + +def _decode_path(value: str) -> pathlib.PurePosixPath: + """Decode and return a path which may contain octal escape sequences.""" + # See: https://github.com/torvalds/linux/blob/aea23e7c464bfdec04b52cf61edb62030e9e0d0a/fs/proc_namespace.c#L150 + path = re.sub(r'(\\[0-7]{3})', lambda m: codecs.decode(m.group(0).encode('ascii'), 'unicode_escape'), value) + return pathlib.PurePosixPath(path) diff --git a/test/lib/ansible_test/_internal/cli/argparsing/parsers.py b/test/lib/ansible_test/_internal/cli/argparsing/parsers.py index cdd9956bb8..dcff978c9a 100644 --- a/test/lib/ansible_test/_internal/cli/argparsing/parsers.py +++ b/test/lib/ansible_test/_internal/cli/argparsing/parsers.py @@ -286,6 +286,19 @@ class ChoicesParser(DynamicChoicesParser): return '|'.join(self.choices) +class EnumValueChoicesParser(ChoicesParser): + """Composite argument parser which relies on a static list of choices derived from the values of an enum.""" + def __init__(self, enum_type: t.Type[enum.Enum], conditions: MatchConditions = MatchConditions.CHOICE) -> None: + self.enum_type = enum_type + + super().__init__(choices=[str(item.value) for item in enum_type], conditions=conditions) + + def parse(self, state: ParserState) -> t.Any: + """Parse the input from the given state and return the result.""" + value = super().parse(state) + return self.enum_type(value) + + class IntegerParser(DynamicChoicesParser): """Composite argument parser for integers.""" PATTERN = re.compile('^[1-9][0-9]*$') diff --git a/test/lib/ansible_test/_internal/cli/environments.py b/test/lib/ansible_test/_internal/cli/environments.py index e3e759fda5..1495f8efcd 100644 --- a/test/lib/ansible_test/_internal/cli/environments.py +++ b/test/lib/ansible_test/_internal/cli/environments.py @@ -397,6 +397,8 @@ def add_global_docker( docker_network=None, docker_terminate=None, prime_containers=False, + dev_systemd_debug=False, + dev_probe_cgroups=None, ) return @@ -428,6 +430,24 @@ def add_global_docker( help='download containers without running tests', ) + # Docker support isn't related to ansible-core-ci. + # However, ansible-core-ci support is a reasonable indicator that the user may need the `--dev-*` options. + suppress = None if get_ci_provider().supports_core_ci_auth() else argparse.SUPPRESS + + parser.add_argument( + '--dev-systemd-debug', + action='store_true', + help=suppress or 'enable systemd debugging in containers', + ) + + parser.add_argument( + '--dev-probe-cgroups', + metavar='DIR', + nargs='?', + const='', + help=suppress or 'probe container cgroups, with optional log dir', + ) + def add_environment_docker( exclusive_parser, # type: argparse.ArgumentParser diff --git a/test/lib/ansible_test/_internal/cli/parsers/key_value_parsers.py b/test/lib/ansible_test/_internal/cli/parsers/key_value_parsers.py index 8f71e7635e..820f9c4b1c 100644 --- a/test/lib/ansible_test/_internal/cli/parsers/key_value_parsers.py +++ b/test/lib/ansible_test/_internal/cli/parsers/key_value_parsers.py @@ -10,6 +10,11 @@ from ...constants import ( SUPPORTED_PYTHON_VERSIONS, ) +from ...completion import ( + AuditMode, + CGroupVersion, +) + from ...util import ( REMOTE_ARCHITECTURES, ) @@ -18,11 +23,16 @@ from ...host_configs import ( OriginConfig, ) +from ...become import ( + SUPPORTED_BECOME_METHODS, +) + from ..argparsing.parsers import ( AnyParser, BooleanParser, ChoicesParser, DocumentationState, + EnumValueChoicesParser, IntegerParser, KeyValueParser, Parser, @@ -99,6 +109,8 @@ class DockerKeyValueParser(KeyValueParser): return dict( python=PythonParser(versions=self.versions, allow_venv=False, allow_default=self.allow_default), seccomp=ChoicesParser(SECCOMP_CHOICES), + cgroup=EnumValueChoicesParser(CGroupVersion), + audit=EnumValueChoicesParser(AuditMode), privileged=BooleanParser(), memory=IntegerParser(), ) @@ -112,6 +124,8 @@ class DockerKeyValueParser(KeyValueParser): state.sections[f'{"controller" if self.controller else "target"} {section_name} (comma separated):'] = '\n'.join([ f' python={python_parser.document(state)}', f' seccomp={ChoicesParser(SECCOMP_CHOICES).document(state)}', + f' cgroup={EnumValueChoicesParser(CGroupVersion).document(state)}', + f' audit={EnumValueChoicesParser(AuditMode).document(state)}', f' privileged={BooleanParser().document(state)}', f' memory={IntegerParser().document(state)} # bytes', ]) @@ -129,6 +143,7 @@ class PosixRemoteKeyValueParser(KeyValueParser): def get_parsers(self, state): # type: (ParserState) -> t.Dict[str, Parser] """Return a dictionary of key names and value parsers.""" return dict( + become=ChoicesParser(list(SUPPORTED_BECOME_METHODS)), provider=ChoicesParser(REMOTE_PROVIDERS), arch=ChoicesParser(REMOTE_ARCHITECTURES), python=PythonParser(versions=self.versions, allow_venv=False, allow_default=self.allow_default), @@ -141,6 +156,7 @@ class PosixRemoteKeyValueParser(KeyValueParser): section_name = 'remote options' state.sections[f'{"controller" if self.controller else "target"} {section_name} (comma separated):'] = '\n'.join([ + f' become={ChoicesParser(list(SUPPORTED_BECOME_METHODS)).document(state)}', f' provider={ChoicesParser(REMOTE_PROVIDERS).document(state)}', f' arch={ChoicesParser(REMOTE_ARCHITECTURES).document(state)}', f' python={python_parser.document(state)}', diff --git a/test/lib/ansible_test/_internal/commands/env/__init__.py b/test/lib/ansible_test/_internal/commands/env/__init__.py index d8f11b87e9..41a1d52090 100644 --- a/test/lib/ansible_test/_internal/commands/env/__init__.py +++ b/test/lib/ansible_test/_internal/commands/env/__init__.py @@ -17,9 +17,9 @@ from ...io import ( from ...util import ( display, - SubprocessError, get_ansible_version, get_available_python_versions, + ApplicationError, ) from ...util_common import ( @@ -30,8 +30,8 @@ from ...util_common import ( from ...docker_util import ( get_docker_command, - docker_info, - docker_version + get_docker_info, + get_docker_container_id, ) from ...constants import ( @@ -70,11 +70,14 @@ def show_dump_env(args): # type: (EnvConfig) -> None if not args.show and not args.dump: return + container_id = get_docker_container_id() + data = dict( ansible=dict( version=get_ansible_version(), ), docker=get_docker_details(args), + container_id=container_id, environ=os.environ.copy(), location=dict( pwd=os.environ.get('PWD', None), @@ -178,14 +181,12 @@ def get_docker_details(args): # type: (EnvConfig) -> t.Dict[str, t.Any] executable = docker.executable try: - info = docker_info(args) - except SubprocessError as ex: - display.warning('Failed to collect docker info:\n%s' % ex) - - try: - version = docker_version(args) - except SubprocessError as ex: - display.warning('Failed to collect docker version:\n%s' % ex) + docker_info = get_docker_info(args) + except ApplicationError as ex: + display.warning(str(ex)) + else: + info = docker_info.info + version = docker_info.version docker_details = dict( executable=executable, diff --git a/test/lib/ansible_test/_internal/commands/integration/__init__.py b/test/lib/ansible_test/_internal/commands/integration/__init__.py index d5f497b541..a20c3ab0f8 100644 --- a/test/lib/ansible_test/_internal/commands/integration/__init__.py +++ b/test/lib/ansible_test/_internal/commands/integration/__init__.py @@ -530,6 +530,10 @@ def command_integration_filtered( if not tries: raise + if target.retry_never: + display.warning(f'Skipping retry of test target "{target.name}" since it has been excluded from retries.') + raise + display.warning('Retrying test target "%s" with maximum verbosity.' % target.name) display.verbosity = args.verbosity = 6 diff --git a/test/lib/ansible_test/_internal/commands/integration/coverage.py b/test/lib/ansible_test/_internal/commands/integration/coverage.py index 3146181afc..dd885c30f9 100644 --- a/test/lib/ansible_test/_internal/commands/integration/coverage.py +++ b/test/lib/ansible_test/_internal/commands/integration/coverage.py @@ -33,6 +33,7 @@ from ...util import ( get_type_map, remove_tree, sanitize_host_name, + verified_chmod, ) from ...util_common import ( @@ -166,9 +167,9 @@ class PosixCoverageHandler(CoverageHandler[PosixConfig]): write_text_file(coverage_config_path, coverage_config, create_directories=True) - os.chmod(coverage_config_path, MODE_FILE) + verified_chmod(coverage_config_path, MODE_FILE) os.mkdir(coverage_output_path) - os.chmod(coverage_output_path, MODE_DIRECTORY_WRITE) + verified_chmod(coverage_output_path, MODE_DIRECTORY_WRITE) def setup_target(self): """Perform setup for code coverage on the target.""" diff --git a/test/lib/ansible_test/_internal/commands/sanity/integration_aliases.py b/test/lib/ansible_test/_internal/commands/sanity/integration_aliases.py index 34ef5f8ee9..bc96b684f8 100644 --- a/test/lib/ansible_test/_internal/commands/sanity/integration_aliases.py +++ b/test/lib/ansible_test/_internal/commands/sanity/integration_aliases.py @@ -319,6 +319,9 @@ class IntegrationAliasesTest(SanitySingleVersion): messages = [] for path in unassigned_paths: + if path == 'test/integration/targets/ansible-test-container': + continue # special test target which uses group 6 -- nothing else should be in that group + messages.append(SanityMessage(unassigned_message, '%s/aliases' % path)) for path in conflicting_paths: diff --git a/test/lib/ansible_test/_internal/commands/shell/__init__.py b/test/lib/ansible_test/_internal/commands/shell/__init__.py index e62437ead7..099734df59 100644 --- a/test/lib/ansible_test/_internal/commands/shell/__init__.py +++ b/test/lib/ansible_test/_internal/commands/shell/__init__.py @@ -9,6 +9,8 @@ from ...util import ( ApplicationError, OutputStream, display, + SubprocessError, + HostConnectionError, ) from ...config import ( @@ -115,4 +117,19 @@ def command_shell(args): # type: (ShellConfig) -> None else: cmd = [] - con.run(cmd, capture=False, interactive=True) + try: + con.run(cmd, capture=False, interactive=True) + except SubprocessError as ex: + if isinstance(con, SshConnection) and ex.status == 255: + # 255 indicates SSH itself failed, rather than a command run on the remote host. + # In this case, report a host connection error so additional troubleshooting output is provided. + if not args.delegate and not args.host_path: + def callback() -> None: + """Callback to run during error display.""" + target_profile.on_target_failure() # when the controller is not delegated, report failures immediately + else: + callback = None + + raise HostConnectionError(f'SSH shell connection failed for host {target_profile.config}: {ex}', callback) from ex + + raise diff --git a/test/lib/ansible_test/_internal/completion.py b/test/lib/ansible_test/_internal/completion.py index 73aee2f25f..afa437a405 100644 --- a/test/lib/ansible_test/_internal/completion.py +++ b/test/lib/ansible_test/_internal/completion.py @@ -3,6 +3,7 @@ from __future__ import annotations import abc import dataclasses +import enum import os import typing as t @@ -21,6 +22,30 @@ from .data import ( data_context, ) +from .become import ( + SUPPORTED_BECOME_METHODS, +) + + +class CGroupVersion(enum.Enum): + """The control group version(s) required by a container.""" + NONE = 'none' + V1_ONLY = 'v1-only' + V2_ONLY = 'v2-only' + V1_V2 = 'v1-v2' + + def __repr__(self) -> str: + return f'{self.__class__.__name__}.{self.name}' + + +class AuditMode(enum.Enum): + """The audit requirements of a container.""" + NONE = 'none' + REQUIRED = 'required' + + def __repr__(self) -> str: + return f'{self.__class__.__name__}.{self.name}' + @dataclasses.dataclass(frozen=True) class CompletionConfig(metaclass=abc.ABCMeta): @@ -136,6 +161,8 @@ class DockerCompletionConfig(PythonCompletionConfig): """Configuration for Docker containers.""" image: str = '' seccomp: str = 'default' + cgroup: str = CGroupVersion.V1_V2.value + audit: str = AuditMode.REQUIRED.value # most containers need this, so the default is required, leaving it to be opt-out for containers which don't need it placeholder: bool = False @property @@ -143,6 +170,22 @@ class DockerCompletionConfig(PythonCompletionConfig): """True if the completion entry is only used for defaults, otherwise False.""" return False + @property + def audit_enum(self) -> AuditMode: + """The audit requirements for the container. Raises an exception if the value is invalid.""" + try: + return AuditMode(self.audit) + except ValueError: + raise ValueError(f'Docker completion entry "{self.name}" has an invalid value "{self.audit}" for the "audit" setting.') from None + + @property + def cgroup_enum(self) -> CGroupVersion: + """The control group version(s) required by the container. Raises an exception if the value is invalid.""" + try: + return CGroupVersion(self.cgroup) + except ValueError: + raise ValueError(f'Docker completion entry "{self.name}" has an invalid value "{self.cgroup}" for the "cgroup" setting.') from None + def __post_init__(self): if not self.image: raise Exception(f'Docker completion entry "{self.name}" must provide an "image" setting.') @@ -150,6 +193,10 @@ class DockerCompletionConfig(PythonCompletionConfig): if not self.supported_pythons and not self.placeholder: raise Exception(f'Docker completion entry "{self.name}" must provide a "python" setting.') + # verify properties can be correctly parsed to enums + assert self.audit_enum + assert self.cgroup_enum + @dataclasses.dataclass(frozen=True) class NetworkRemoteCompletionConfig(RemoteCompletionConfig): @@ -166,12 +213,16 @@ class NetworkRemoteCompletionConfig(RemoteCompletionConfig): @dataclasses.dataclass(frozen=True) class PosixRemoteCompletionConfig(RemoteCompletionConfig, PythonCompletionConfig): """Configuration for remote POSIX platforms.""" + become: t.Optional[str] = None placeholder: bool = False def __post_init__(self): if not self.placeholder: super().__post_init__() + if self.become and self.become not in SUPPORTED_BECOME_METHODS: + raise Exception(f'POSIX remote completion entry "{self.name}" setting "become" must be omitted or one of: {", ".join(SUPPORTED_BECOME_METHODS)}') + if not self.supported_pythons: if self.version and not self.placeholder: raise Exception(f'POSIX remote completion entry "{self.name}" must provide a "python" setting.') diff --git a/test/lib/ansible_test/_internal/config.py b/test/lib/ansible_test/_internal/config.py index 64e504cbd9..4061dd8ae5 100644 --- a/test/lib/ansible_test/_internal/config.py +++ b/test/lib/ansible_test/_internal/config.py @@ -111,6 +111,9 @@ class EnvironmentConfig(CommonConfig): self.delegate_args = [] # type: t.List[str] + self.dev_systemd_debug: bool = args.dev_systemd_debug + self.dev_probe_cgroups: t.Optional[str] = args.dev_probe_cgroups + def host_callback(files): # type: (t.List[t.Tuple[str, str]]) -> None """Add the host files to the payload file list.""" config = self diff --git a/test/lib/ansible_test/_internal/constants.py b/test/lib/ansible_test/_internal/constants.py index 846678516e..eec0c686d9 100644 --- a/test/lib/ansible_test/_internal/constants.py +++ b/test/lib/ansible_test/_internal/constants.py @@ -6,6 +6,8 @@ from .._util.target.common.constants import ( REMOTE_ONLY_PYTHON_VERSIONS, ) +STATUS_HOST_CONNECTION_ERROR = 4 + # Setting a low soft RLIMIT_NOFILE value will improve the performance of subprocess.Popen on Python 2.x when close_fds=True. # This will affect all Python subprocesses. It will also affect the current Python process if set before subprocess is imported for the first time. SOFT_RLIMIT_NOFILE = 1024 diff --git a/test/lib/ansible_test/_internal/containers.py b/test/lib/ansible_test/_internal/containers.py index f4d16714a6..36bef8269c 100644 --- a/test/lib/ansible_test/_internal/containers.py +++ b/test/lib/ansible_test/_internal/containers.py @@ -34,8 +34,10 @@ from .config import ( from .docker_util import ( ContainerNotFoundError, DockerInspect, + docker_create, docker_exec, docker_inspect, + docker_network_inspect, docker_pull, docker_rm, docker_run, @@ -44,6 +46,7 @@ from .docker_util import ( get_docker_host_ip, get_podman_host_ip, require_docker, + detect_host_properties, ) from .ansible_util import ( @@ -80,6 +83,10 @@ from .connections import ( SshConnection, ) +from .thread import ( + mutex, +) + # information about support containers provisioned by the current ansible-test instance support_containers = {} # type: t.Dict[str, ContainerDescriptor] support_containers_mutex = threading.Lock() @@ -138,10 +145,10 @@ def run_support_container( if current_container_id: publish_ports = False # publishing ports is pointless if already running in a docker container - options = (options or []) + ['--name', name] + options = (options or []) if start: - options.append('-d') + options.append('-dt') # the -t option is required to cause systemd in the container to log output to the console if publish_ports: for port in ports: @@ -151,6 +158,10 @@ def run_support_container( for key, value in env.items(): options.extend(['--env', '%s=%s' % (key, value)]) + max_open_files = detect_host_properties(args).max_open_files + + options.extend(['--ulimit', 'nofile=%s' % max_open_files]) + support_container_id = None if allow_existing: @@ -175,6 +186,9 @@ def run_support_container( if not support_container_id: docker_rm(args, name) + if args.dev_systemd_debug: + options.extend(('--env', 'SYSTEMD_LOG_LEVEL=debug')) + if support_container_id: display.info('Using existing "%s" container.' % name) running = True @@ -182,7 +196,7 @@ def run_support_container( else: display.info('Starting new "%s" container.' % name) docker_pull(args, image) - support_container_id = docker_run(args, image, options, create_only=not start, cmd=cmd) + support_container_id = run_container(args, image, name, options, create_only=not start, cmd=cmd) running = start existing = False @@ -220,6 +234,126 @@ def run_support_container( return descriptor +def run_container( + args: EnvironmentConfig, + image: str, + name: str, + options: t.Optional[list[str]], + cmd: t.Optional[list[str]] = None, + create_only: bool = False, +) -> str: + """Run a container using the given docker image.""" + options = list(options or []) + cmd = list(cmd or []) + + options.extend(['--name', name]) + + network = get_docker_preferred_network_name(args) + + if is_docker_user_defined_network(network): + # Only when the network is not the default bridge network. + options.extend(['--network', network]) + + for _iteration in range(1, 3): + try: + if create_only: + stdout = docker_create(args, image, options, cmd)[0] + else: + stdout = docker_run(args, image, options, cmd)[0] + except SubprocessError as ex: + display.error(ex.message) + display.warning('Failed to run docker image "{image}". Waiting a few seconds before trying again.') + docker_rm(args, name) # podman doesn't remove containers after create if run fails + time.sleep(3) + else: + if args.explain: + stdout = ''.join(random.choice('0123456789abcdef') for _iteration in range(64)) + + return stdout.strip() + + raise ApplicationError(f'Failed to run docker image "{image}".') + + +def start_container(args: EnvironmentConfig, container_id: str) -> tuple[t.Optional[str], t.Optional[str]]: + """Start a docker container by name or ID.""" + options: list[str] = [] + + for _iteration in range(1, 3): + try: + return docker_start(args, container_id, options) + except SubprocessError as ex: + display.error(ex.message) + display.warning(f'Failed to start docker container "{container_id}". Waiting a few seconds before trying again.') + time.sleep(3) + + raise ApplicationError(f'Failed to start docker container "{container_id}".') + + +def get_container_ip_address(args: EnvironmentConfig, container: DockerInspect) -> t.Optional[str]: + """Return the IP address of the container for the preferred docker network.""" + if container.networks: + network_name = get_docker_preferred_network_name(args) + + if not network_name: + # Sort networks and use the first available. + # This assumes all containers will have access to the same networks. + network_name = sorted(container.networks.keys()).pop(0) + + ipaddress = container.networks[network_name]['IPAddress'] + else: + ipaddress = container.network_settings['IPAddress'] + + if not ipaddress: + return None + + return ipaddress + + +@mutex +def get_docker_preferred_network_name(args: EnvironmentConfig) -> t.Optional[str]: + """ + Return the preferred network name for use with Docker. The selection logic is: + - the network selected by the user with `--docker-network` + - the network of the currently running docker container (if any) + - the default docker network (returns None) + """ + try: + return get_docker_preferred_network_name.network # type: ignore[attr-defined] + except AttributeError: + pass + + network = None + + if args.docker_network: + network = args.docker_network + else: + current_container_id = get_docker_container_id() + + if current_container_id: + # Make sure any additional containers we launch use the same network as the current container we're running in. + # This is needed when ansible-test is running in a container that is not connected to Docker's default network. + container = docker_inspect(args, current_container_id, always=True) + network = container.get_network_name() + + # The default docker behavior puts containers on the same network. + # The default podman behavior puts containers on isolated networks which don't allow communication between containers or network disconnect. + # Starting with podman version 2.1.0 rootless containers are able to join networks. + # Starting with podman version 2.2.0 containers can be disconnected from networks. + # To maintain feature parity with docker, detect and use the default "podman" network when running under podman. + if network is None and require_docker().command == 'podman' and docker_network_inspect(args, 'podman', always=True): + network = 'podman' + + get_docker_preferred_network_name.network = network # type: ignore[attr-defined] + + return network + + +def is_docker_user_defined_network(network: str) -> bool: + """Return True if the network being used is a user-defined network.""" + return bool(network) and network != 'bridge' + + +@mutex def get_container_database(args): # type: (EnvironmentConfig) -> ContainerDatabase """Return the current container database, creating it as needed, or returning the one provided on the command line through delegation.""" try: @@ -571,7 +705,7 @@ class ContainerDescriptor: def start(self, args): # type: (EnvironmentConfig) -> None """Start the container. Used for containers which are created, but not started.""" - docker_start(args, self.name) + start_container(args, self.name) self.register(args) @@ -581,7 +715,7 @@ class ContainerDescriptor: raise Exception('Container already registered: %s' % self.name) try: - container = docker_inspect(args, self.container_id) + container = docker_inspect(args, self.name) except ContainerNotFoundError: if not args.explain: raise @@ -598,7 +732,7 @@ class ContainerDescriptor: ), )) - support_container_ip = container.get_ip_address() + support_container_ip = get_container_ip_address(args, container) if self.publish_ports: # inspect the support container to locate the published ports @@ -663,7 +797,7 @@ def cleanup_containers(args): # type: (EnvironmentConfig) -> None if container.cleanup == CleanupMode.YES: docker_rm(args, container.container_id) elif container.cleanup == CleanupMode.INFO: - display.notice('Remember to run `docker rm -f %s` when finished testing.' % container.name) + display.notice(f'Remember to run `{require_docker().command} rm -f {container.name}` when finished testing.') def create_hosts_entries(context): # type: (t.Dict[str, ContainerAccess]) -> t.List[str] diff --git a/test/lib/ansible_test/_internal/coverage_util.py b/test/lib/ansible_test/_internal/coverage_util.py index c514079ade..869a3a3a72 100644 --- a/test/lib/ansible_test/_internal/coverage_util.py +++ b/test/lib/ansible_test/_internal/coverage_util.py @@ -41,6 +41,10 @@ from .host_configs import ( PythonConfig, ) +from .thread import ( + mutex, +) + def cover_python( args, # type: TestConfig @@ -107,6 +111,7 @@ def get_coverage_environment( return env +@mutex def get_coverage_config(args): # type: (TestConfig) -> str """Return the path to the coverage config, creating the config if it does not already exist.""" try: diff --git a/test/lib/ansible_test/_internal/delegation.py b/test/lib/ansible_test/_internal/delegation.py index 247ae3535e..b3b8ad51dc 100644 --- a/test/lib/ansible_test/_internal/delegation.py +++ b/test/lib/ansible_test/_internal/delegation.py @@ -7,6 +7,10 @@ import os import tempfile import typing as t +from .constants import ( + STATUS_HOST_CONNECTION_ERROR, +) + from .io import ( make_dirs, ) @@ -195,6 +199,7 @@ def delegate_command(args, host_state, exclude, require): # type: (EnvironmentC con.user = pytest_user success = False + status = 0 try: # When delegating, preserve the original separate stdout/stderr streams, but only when the following conditions are met: @@ -204,10 +209,17 @@ def delegate_command(args, host_state, exclude, require): # type: (EnvironmentC output_stream = OutputStream.ORIGINAL if args.display_stderr and not args.interactive else None con.run(insert_options(command, options), capture=False, interactive=args.interactive, output_stream=output_stream) success = True + except SubprocessError as ex: + status = ex.status + raise finally: if host_delegation: download_results(args, con, content_root, success) + if not success and status == STATUS_HOST_CONNECTION_ERROR: + for target in host_state.target_profiles: + target.on_target_failure() # when the controller is delegated, report failures after delegation fails + def insert_options(command, options): """Insert addition command line options into the given command and return the result.""" diff --git a/test/lib/ansible_test/_internal/dev/__init__.py b/test/lib/ansible_test/_internal/dev/__init__.py new file mode 100644 index 0000000000..e7c9b7d54f --- /dev/null +++ b/test/lib/ansible_test/_internal/dev/__init__.py @@ -0,0 +1,2 @@ +"""Development and testing support code. Enabled through the use of `--dev-*` command line options.""" +from __future__ import annotations diff --git a/test/lib/ansible_test/_internal/dev/container_probe.py b/test/lib/ansible_test/_internal/dev/container_probe.py new file mode 100644 index 0000000000..efce383d0f --- /dev/null +++ b/test/lib/ansible_test/_internal/dev/container_probe.py @@ -0,0 +1,216 @@ +"""Diagnostic utilities to probe container cgroup behavior during development and testing (both manual and integration).""" +from __future__ import annotations + +import dataclasses +import enum +import json +import os +import pathlib +import pwd +import typing as t + +from ..io import ( + read_text_file, + write_text_file, +) + +from ..util import ( + display, + ANSIBLE_TEST_TARGET_ROOT, +) + +from ..config import ( + EnvironmentConfig, +) + +from ..docker_util import ( + LOGINUID_NOT_SET, + docker_exec, + get_docker_info, + get_podman_remote, + require_docker, +) + +from ..host_configs import ( + DockerConfig, +) + +from ..cgroup import ( + CGroupEntry, + CGroupPath, + MountEntry, + MountType, +) + + +class CGroupState(enum.Enum): + """The expected state of a cgroup related mount point.""" + HOST = enum.auto() + PRIVATE = enum.auto() + SHADOWED = enum.auto() + + +@dataclasses.dataclass(frozen=True) +class CGroupMount: + """Details on a cgroup mount point that is expected to be present in the container.""" + path: str + type: t.Optional[str] + writable: t.Optional[bool] + state: t.Optional[CGroupState] + + def __post_init__(self): + assert is_relative_to(pathlib.PurePosixPath(self.path), CGroupPath.ROOT) + + if self.type is None: + assert self.state is None + elif self.type == MountType.TMPFS: + assert self.writable is True + assert self.state is None + else: + assert self.type in (MountType.CGROUP_V1, MountType.CGROUP_V2) + assert self.state is not None + + +def check_container_cgroup_status(args: EnvironmentConfig, config: DockerConfig, container_name: str, expected_mounts: tuple[CGroupMount, ...]) -> None: + """Check the running container to examine the state of the cgroup hierarchies.""" + cmd = ['sh', '-c', 'cat /proc/1/cgroup && echo && cat /proc/1/mountinfo'] + + stdout = docker_exec(args, container_name, cmd, capture=True)[0] + cgroups_stdout, mounts_stdout = stdout.split('\n\n') + + cgroups = CGroupEntry.loads(cgroups_stdout) + mounts = MountEntry.loads(mounts_stdout) + + mounts = tuple(mount for mount in mounts if is_relative_to(mount.path, CGroupPath.ROOT)) + + mount_cgroups: dict[MountEntry, CGroupEntry] = {} + probe_paths: dict[pathlib.PurePosixPath, t.Optional[str]] = {} + + for cgroup in cgroups: + if cgroup.subsystem: + mount = ([mount for mount in mounts if + mount.type == MountType.CGROUP_V1 and + is_relative_to(mount.path, cgroup.root_path) and + is_relative_to(cgroup.full_path, mount.path) + ] or [None])[-1] + else: + mount = ([mount for mount in mounts if + mount.type == MountType.CGROUP_V2 and + mount.path == cgroup.root_path + ] or [None])[-1] + + if mount: + mount_cgroups[mount] = cgroup + + for mount in mounts: + probe_paths[mount.path] = None + + if (cgroup := mount_cgroups.get(mount)) and cgroup.full_path != mount.path: # child of mount.path + probe_paths[cgroup.full_path] = None + + probe_script = read_text_file(os.path.join(ANSIBLE_TEST_TARGET_ROOT, 'setup', 'probe_cgroups.py')) + probe_command = [config.python.path, '-', f'{container_name}-probe'] + [str(path) for path in probe_paths] + probe_results = json.loads(docker_exec(args, container_name, probe_command, capture=True, data=probe_script)[0]) + + for path in probe_paths: + probe_paths[path] = probe_results[str(path)] + + remaining_mounts: dict[pathlib.PurePosixPath, MountEntry] = {mount.path: mount for mount in mounts} + results: dict[pathlib.PurePosixPath, tuple[bool, str]] = {} + + for expected_mount in expected_mounts: + expected_path = pathlib.PurePosixPath(expected_mount.path) + + if not (actual_mount := remaining_mounts.pop(expected_path, None)): + results[expected_path] = (False, 'not mounted') + continue + + actual_mount_write_error = probe_paths[actual_mount.path] + actual_mount_errors = [] + + if cgroup := mount_cgroups.get(actual_mount): + if expected_mount.state == CGroupState.SHADOWED: + actual_mount_errors.append('unexpected cgroup association') + + if cgroup.root_path == cgroup.full_path and expected_mount.state == CGroupState.HOST: + results[cgroup.root_path.joinpath('???')] = (False, 'missing cgroup') + + if cgroup.full_path == actual_mount.path: + if cgroup.root_path != cgroup.full_path and expected_mount.state == CGroupState.PRIVATE: + actual_mount_errors.append('unexpected mount') + else: + cgroup_write_error = probe_paths[cgroup.full_path] + cgroup_errors = [] + + if expected_mount.state == CGroupState.SHADOWED: + cgroup_errors.append('unexpected cgroup association') + + if cgroup.root_path != cgroup.full_path and expected_mount.state == CGroupState.PRIVATE: + cgroup_errors.append('unexpected cgroup') + + if cgroup_write_error: + cgroup_errors.append(cgroup_write_error) + + if cgroup_errors: + results[cgroup.full_path] = (False, f'directory errors: {", ".join(cgroup_errors)}') + else: + results[cgroup.full_path] = (True, 'directory (writable)') + elif expected_mount.state not in (None, CGroupState.SHADOWED): + actual_mount_errors.append('missing cgroup association') + + if actual_mount.type != expected_mount.type and expected_mount.type is not None: + actual_mount_errors.append(f'type not {expected_mount.type}') + + if bool(actual_mount_write_error) == expected_mount.writable: + actual_mount_errors.append(f'{actual_mount_write_error or "writable"}') + + if actual_mount_errors: + results[actual_mount.path] = (False, f'{actual_mount.type} errors: {", ".join(actual_mount_errors)}') + else: + results[actual_mount.path] = (True, f'{actual_mount.type} ({actual_mount_write_error or "writable"})') + + for remaining_mount in remaining_mounts.values(): + remaining_mount_write_error = probe_paths[remaining_mount.path] + + results[remaining_mount.path] = (False, f'unexpected {remaining_mount.type} mount ({remaining_mount_write_error or "writable"})') + + identity = get_identity(args, config, container_name) + messages: list[tuple[pathlib.PurePosixPath, bool, str]] = [(path, result[0], result[1]) for path, result in sorted(results.items())] + message = '\n'.join(f'{"PASS" if result else "FAIL"}: {path} -> {message}' for path, result, message in messages) + + display.info(f'>>> Container: {identity}\n{message.rstrip()}') + + if args.dev_probe_cgroups: + write_text_file(os.path.join(args.dev_probe_cgroups, f'{identity}.log'), message) + + +def get_identity(args: EnvironmentConfig, config: DockerConfig, container_name: str): + """Generate and return an identity string to use when logging test results.""" + engine = require_docker().command + + try: + loginuid = int(read_text_file('/proc/self/loginuid')) + except FileNotFoundError: + loginuid = LOGINUID_NOT_SET + + user = pwd.getpwuid(os.getuid()).pw_name + login_user = user if loginuid == LOGINUID_NOT_SET else pwd.getpwuid(loginuid).pw_name + remote = engine == 'podman' and get_podman_remote() + + tags = ( + config.name, + engine, + f'cgroup={config.cgroup.value}@{get_docker_info(args).cgroup_version}', + f'remote={remote}', + f'user={user}', + f'loginuid={login_user}', + container_name, + ) + + return '|'.join(tags) + + +def is_relative_to(first: pathlib.PurePosixPath, second: t.Union[pathlib.PurePosixPath, str]) -> bool: + """Return True if path `first` is relative to path `second`, otherwise return False.""" + second_path = pathlib.PurePosixPath(second) + return second_path == first or second_path in first.parents diff --git a/test/lib/ansible_test/_internal/docker_util.py b/test/lib/ansible_test/_internal/docker_util.py index 2ed6504bcb..dab60fb4ae 100644 --- a/test/lib/ansible_test/_internal/docker_util.py +++ b/test/lib/ansible_test/_internal/docker_util.py @@ -1,18 +1,17 @@ """Functions for accessing docker via the docker cli.""" from __future__ import annotations +import dataclasses +import enum import json import os -import random +import pathlib +import re import socket import time import urllib.parse import typing as t -from .io import ( - read_text_file, -) - from .util import ( ApplicationError, common_environment, @@ -30,7 +29,17 @@ from .util_common import ( from .config import ( CommonConfig, - EnvironmentConfig, +) + +from .thread import ( + mutex, + named_lock, +) + +from .cgroup import ( + CGroupEntry, + MountEntry, + MountType, ) DOCKER_COMMANDS = [ @@ -38,10 +47,373 @@ DOCKER_COMMANDS = [ 'podman', ] +UTILITY_IMAGE = 'quay.io/ansible/ansible-test-utility-container:2.0.0' + # Max number of open files in a docker container. # Passed with --ulimit option to the docker run command. MAX_NUM_OPEN_FILES = 10240 +# The value of /proc/*/loginuid when it is not set. +# It is a reserved UID, which is the maximum 32-bit unsigned integer value. +# See: https://access.redhat.com/solutions/25404 +LOGINUID_NOT_SET = 4294967295 + + +class DockerInfo: + """The results of `docker info` and `docker version` for the container runtime.""" + + @classmethod + def init(cls, args: CommonConfig) -> DockerInfo: + """Initialize and return a DockerInfo instance.""" + command = require_docker().command + + info_stdout = docker_command(args, ['info', '--format', '{{ json . }}'], capture=True, always=True)[0] + info = json.loads(info_stdout) + + if server_errors := info.get('ServerErrors'): + # This can occur when a remote docker instance is in use and the instance is not responding, such as when the system is still starting up. + # In that case an error such as the following may be returned: + # error during connect: Get "http://{hostname}:2375/v1.24/info": dial tcp {ip_address}:2375: connect: no route to host + raise ApplicationError('Unable to get container host information: ' + '\n'.join(server_errors)) + + version_stdout = docker_command(args, ['version', '--format', '{{ json . }}'], capture=True, always=True)[0] + version = json.loads(version_stdout) + + info = DockerInfo(args, command, info, version) + + return info + + def __init__(self, args: CommonConfig, engine: str, info: dict[str, t.Any], version: dict[str, t.Any]) -> None: + self.args = args + self.engine = engine + self.info = info + self.version = version + + @property + def client(self) -> dict[str, t.Any]: + """The client version details.""" + client = self.version.get('Client') + + if not client: + raise ApplicationError('Unable to get container host client information.') + + return client + + @property + def server(self) -> dict[str, t.Any]: + """The server version details.""" + server = self.version.get('Server') + + if not server: + if self.engine == 'podman': + # Some Podman versions always report server version info (verified with 1.8.0 and 1.9.3). + # Others do not unless Podman remote is being used. + # To provide consistency, use the client version if the server version isn't provided. + # See: https://github.com/containers/podman/issues/2671#issuecomment-804382934 + return self.client + + raise ApplicationError('Unable to get container host server information.') + + return server + + @property + def client_version(self) -> str: + """The client version.""" + return self.client['Version'] + + @property + def server_version(self) -> str: + """The server version.""" + return self.server['Version'] + + @property + def client_major_minor_version(self) -> tuple[int, int]: + """The client major and minor version.""" + major, minor = self.client_version.split('.')[:2] + return int(major), int(minor) + + @property + def server_major_minor_version(self) -> tuple[int, int]: + """The server major and minor version.""" + major, minor = self.server_version.split('.')[:2] + return int(major), int(minor) + + @property + def cgroupns_option_supported(self) -> bool: + """Return True if the `--cgroupns` option is supported, otherwise return False.""" + if self.engine == 'docker': + # Docker added support for the `--cgroupns` option in version 20.10. + # Both the client and server must support the option to use it. + # See: https://docs.docker.com/engine/release-notes/#20100 + return self.client_major_minor_version >= (20, 10) and self.server_major_minor_version >= (20, 10) + + raise NotImplementedError(self.engine) + + @property + def cgroup_version(self) -> int: + """The cgroup version of the container host.""" + info = self.info + host = info.get('host') + + # When the container host reports cgroup v1 it is running either cgroup v1 legacy mode or cgroup v2 hybrid mode. + # When the container host reports cgroup v2 it is running under cgroup v2 unified mode. + # See: https://github.com/containers/podman/blob/8356621249e36ed62fc7f35f12d17db9027ff076/libpod/info_linux.go#L52-L56 + # See: https://github.com/moby/moby/blob/d082bbcc0557ec667faca81b8b33bec380b75dac/daemon/info_unix.go#L24-L27 + + if host: + return int(host['cgroupVersion'].lstrip('v')) # podman + + try: + return int(info['CgroupVersion']) # docker + except KeyError: + pass + + # Docker 20.10 (API version 1.41) added support for cgroup v2. + # Unfortunately the client or server is too old to report the cgroup version. + # If the server is old, we can infer the cgroup version. + # Otherwise, we'll need to fall back to detection. + # See: https://docs.docker.com/engine/release-notes/#20100 + # See: https://docs.docker.com/engine/api/version-history/#v141-api-changes + + if self.server_major_minor_version < (20, 10): + return 1 # old docker server with only cgroup v1 support + + # Tell the user what versions they have and recommend they upgrade the client. + # Downgrading the server should also work, but we won't mention that. + message = ( + f'The Docker client version is {self.client_version}. ' + f'The Docker server version is {self.server_version}. ' + 'Upgrade your Docker client to version 20.10 or later.' + ) + + if detect_host_properties(self.args).cgroup_v2: + # Unfortunately cgroup v2 was detected on the Docker server. + # A newer client is needed to support the `--cgroupns` option for use with cgroup v2. + raise ApplicationError(f'Unsupported Docker client and server combination using cgroup v2. {message}') + + display.warning(f'Detected Docker server cgroup v1 using probing. {message}', unique=True) + + return 1 # docker server is using cgroup v1 (or cgroup v2 hybrid) + + @property + def docker_desktop_wsl2(self) -> bool: + """Return True if Docker Desktop integrated with WSL2 is detected, otherwise False.""" + info = self.info + + kernel_version = info.get('KernelVersion') + operating_system = info.get('OperatingSystem') + + dd_wsl2 = kernel_version and kernel_version.endswith('-WSL2') and operating_system == 'Docker Desktop' + + return dd_wsl2 + + @property + def description(self) -> str: + """Describe the container runtime.""" + tags = dict( + client=self.client_version, + server=self.server_version, + cgroup=f'v{self.cgroup_version}', + ) + + labels = [self.engine] + [f'{key}={value}' for key, value in tags.items()] + + if self.docker_desktop_wsl2: + labels.append('DD+WSL2') + + return f'Container runtime: {" ".join(labels)}' + + +@mutex +def get_docker_info(args: CommonConfig) -> DockerInfo: + """Return info for the current container runtime. The results are cached.""" + try: + return get_docker_info.info # type: ignore[attr-defined] + except AttributeError: + pass + + info = DockerInfo.init(args) + + display.info(info.description, verbosity=1) + + get_docker_info.info = info # type: ignore[attr-defined] + + return info + + +class SystemdControlGroupV1Status(enum.Enum): + """The state of the cgroup v1 systemd hierarchy on the container host.""" + SUBSYSTEM_MISSING = 'The systemd cgroup subsystem was not found.' + FILESYSTEM_NOT_MOUNTED = 'The "/sys/fs/cgroup/systemd" filesystem is not mounted.' + MOUNT_TYPE_NOT_CORRECT = 'The "/sys/fs/cgroup/systemd" mount type is not correct.' + VALID = 'The "/sys/fs/cgroup/systemd" mount is valid.' + + +@dataclasses.dataclass(frozen=True) +class ContainerHostProperties: + """Container host properties detected at run time.""" + audit_code: str + max_open_files: int + loginuid: t.Optional[int] + cgroups: tuple[CGroupEntry, ...] + mounts: tuple[MountEntry, ...] + cgroup_v1: SystemdControlGroupV1Status + cgroup_v2: bool + + +@mutex +def detect_host_properties(args: CommonConfig) -> ContainerHostProperties: + """ + Detect and return properties of the container host. + + The information collected is: + + - The errno result from attempting to query the container host's audit status. + - The max number of open files supported by the container host to run containers. + This value may be capped to the maximum value used by ansible-test. + If the value is below the desired limit, a warning is displayed. + - The loginuid used by the container host to run containers, or None if the audit subsystem is unavailable. + - The cgroup subsystems registered with the Linux kernel. + - The mounts visible within a container. + - The status of the systemd cgroup v1 hierarchy. + + This information is collected together to reduce the number of container runs to probe the container host. + """ + try: + return detect_host_properties.properties # type: ignore[attr-defined] + except AttributeError: + pass + + single_line_commands = ( + 'audit-status', + 'cat /proc/sys/fs/nr_open', + 'ulimit -Hn', + '(cat /proc/1/loginuid; echo)', + ) + + multi_line_commands = ( + ' && '.join(single_line_commands), + 'cat /proc/1/cgroup', + 'cat /proc/1/mountinfo', + ) + + options = ['--volume', '/sys/fs/cgroup:/probe:ro'] + cmd = ['sh', '-c', ' && echo "-" && '.join(multi_line_commands)] + + stdout = run_utility_container(args, f'ansible-test-probe-{args.session_name}', cmd, options)[0] + blocks = stdout.split('\n-\n') + + values = blocks[0].split('\n') + + audit_parts = values[0].split(' ', 1) + audit_status = int(audit_parts[0]) + audit_code = audit_parts[1] + + system_limit = int(values[1]) + hard_limit = int(values[2]) + loginuid = int(values[3]) if values[3] else None + + cgroups = CGroupEntry.loads(blocks[1]) + mounts = MountEntry.loads(blocks[2]) + + if hard_limit < MAX_NUM_OPEN_FILES and hard_limit < system_limit and require_docker().command == 'docker': + # Podman will use the highest possible limits, up to its default of 1M. + # See: https://github.com/containers/podman/blob/009afb50b308548eb129bc68e654db6c6ad82e7a/pkg/specgen/generate/oci.go#L39-L58 + # Docker limits are less predictable. They could be the system limit or the user's soft limit. + # If Docker is running as root it should be able to use the system limit. + # When Docker reports a limit below the preferred value and the system limit, attempt to use the preferred value, up to the system limit. + options = ['--ulimit', f'nofile={min(system_limit, MAX_NUM_OPEN_FILES)}'] + cmd = ['sh', '-c', 'ulimit -Hn'] + + try: + stdout = run_utility_container(args, f'ansible-test-ulimit-{args.session_name}', cmd, options)[0] + except SubprocessError as ex: + display.warning(str(ex)) + else: + hard_limit = int(stdout) + + # Check the audit error code from attempting to query the container host's audit status. + # + # The following error codes are known to occur: + # + # EPERM - Operation not permitted + # This occurs when the root user runs a container but lacks the AUDIT_WRITE capability. + # This will cause patched versions of OpenSSH to disconnect after a login succeeds. + # See: https://src.fedoraproject.org/rpms/openssh/blob/f36/f/openssh-7.6p1-audit.patch + # + # EBADF - Bad file number + # This occurs when the host doesn't support the audit system (the open_audit call fails). + # This allows SSH logins to succeed despite the failure. + # See: https://github.com/Distrotech/libaudit/blob/4fc64f79c2a7f36e3ab7b943ce33ab5b013a7782/lib/netlink.c#L204-L209 + # + # ECONNREFUSED - Connection refused + # This occurs when a non-root user runs a container without the AUDIT_WRITE capability. + # When sending an audit message, libaudit ignores this error condition. + # This allows SSH logins to succeed despite the failure. + # See: https://github.com/Distrotech/libaudit/blob/4fc64f79c2a7f36e3ab7b943ce33ab5b013a7782/lib/deprecated.c#L48-L52 + + subsystems = set(cgroup.subsystem for cgroup in cgroups) + mount_types = {mount.path: mount.type for mount in mounts} + + if 'systemd' not in subsystems: + cgroup_v1 = SystemdControlGroupV1Status.SUBSYSTEM_MISSING + elif not (mount_type := mount_types.get(pathlib.PurePosixPath('/probe/systemd'))): + cgroup_v1 = SystemdControlGroupV1Status.FILESYSTEM_NOT_MOUNTED + elif mount_type != MountType.CGROUP_V1: + cgroup_v1 = SystemdControlGroupV1Status.MOUNT_TYPE_NOT_CORRECT + else: + cgroup_v1 = SystemdControlGroupV1Status.VALID + + cgroup_v2 = mount_types.get(pathlib.PurePosixPath('/probe')) == MountType.CGROUP_V2 + + display.info(f'Container host audit status: {audit_code} ({audit_status})', verbosity=1) + display.info(f'Container host max open files: {hard_limit}', verbosity=1) + display.info(f'Container loginuid: {loginuid if loginuid is not None else "unavailable"}' + f'{" (not set)" if loginuid == LOGINUID_NOT_SET else ""}', verbosity=1) + + if hard_limit < MAX_NUM_OPEN_FILES: + display.warning(f'Unable to set container max open files to {MAX_NUM_OPEN_FILES}. Using container host limit of {hard_limit} instead.') + else: + hard_limit = MAX_NUM_OPEN_FILES + + properties = ContainerHostProperties( + # The errno (audit_status) is intentionally not exposed here, as it can vary across systems and architectures. + # Instead, the symbolic name (audit_code) is used, which is resolved inside the container which generated the error. + # See: https://man7.org/linux/man-pages/man3/errno.3.html + audit_code=audit_code, + max_open_files=hard_limit, + loginuid=loginuid, + cgroups=cgroups, + mounts=mounts, + cgroup_v1=cgroup_v1, + cgroup_v2=cgroup_v2, + ) + + detect_host_properties.properties = properties # type: ignore[attr-defined] + + return properties + + +def run_utility_container( + args: CommonConfig, + name: str, + cmd: list[str], + options: list[str], + data: t.Optional[str] = None, +) -> tuple[t.Optional[str], t.Optional[str]]: + """Run the specified command using the ansible-test utility container, returning stdout and stderr.""" + options = options + [ + '--name', name, + '--rm', + ] + + if data: + options.append('-i') + + docker_pull(args, UTILITY_IMAGE) + + return docker_run(args, UTILITY_IMAGE, options, cmd, data) + class DockerCommand: """Details about the available docker command.""" @@ -62,7 +434,7 @@ class DockerCommand: executable = find_executable(command, required=False) if executable: - version = raw_command([command, '-v'], capture=True)[0].strip() + version = raw_command([command, '-v'], env=docker_environment(), capture=True)[0].strip() if command == 'docker' and 'podman' in version: continue # avoid detecting podman as docker @@ -133,14 +505,14 @@ def get_podman_host_ip(): # type: () -> str @cache -def get_podman_default_hostname(): # type: () -> str +def get_podman_default_hostname(): # type: () -> t.Optional[str] """Return the default hostname of the Podman service. --format was added in podman 3.3.0, this functionality depends on it's availability """ hostname = None try: - stdout = raw_command(['podman', 'system', 'connection', 'list', '--format=json'], capture=True)[0] + stdout = raw_command(['podman', 'system', 'connection', 'list', '--format=json'], env=docker_environment(), capture=True)[0] except SubprocessError: stdout = '[]' @@ -159,7 +531,8 @@ def get_podman_default_hostname(): # type: () -> str @cache -def _get_podman_remote(): # type: () -> t.Optional[str] +def get_podman_remote(): # type: () -> t.Optional[str] + """Return the remote podman hostname, if any, otherwise return None.""" # URL value resolution precedence: # - command line value # - environment variable CONTAINER_HOST @@ -184,7 +557,7 @@ def _get_podman_remote(): # type: () -> t.Optional[str] @cache def get_podman_hostname(): # type: () -> str """Return the hostname of the Podman service.""" - hostname = _get_podman_remote() + hostname = get_podman_remote() if not hostname: hostname = 'localhost' @@ -196,162 +569,141 @@ def get_podman_hostname(): # type: () -> str @cache def get_docker_container_id(): # type: () -> t.Optional[str] """Return the current container ID if running in a container, otherwise return None.""" - path = '/proc/self/cpuset' + mountinfo_path = pathlib.Path('/proc/self/mountinfo') container_id = None - - if os.path.exists(path): - # File content varies based on the environment: - # No Container: / - # Docker: /docker/c86f3732b5ba3d28bb83b6e14af767ab96abbc52de31313dcb1176a62d91a507 - # Azure Pipelines (Docker): /azpl_job/0f2edfed602dd6ec9f2e42c867f4d5ee640ebf4c058e6d3196d4393bb8fd0891 - # Podman: /../../../../../.. - contents = read_text_file(path) - - cgroup_path, cgroup_name = os.path.split(contents.strip()) - - if cgroup_path in ('/docker', '/azpl_job'): - container_id = cgroup_name + engine = None + + if mountinfo_path.is_file(): + # NOTE: This method of detecting the container engine and container ID relies on implementation details of each container engine. + # Although the implementation details have remained unchanged for some time, there is no guarantee they will continue to work. + # There have been proposals to create a standard mechanism for this, but none is currently available. + # See: https://github.com/opencontainers/runtime-spec/issues/1105 + + mounts = MountEntry.loads(mountinfo_path.read_text()) + + for mount in mounts: + if str(mount.path) == '/etc/hostname': + # Podman generates /etc/hostname in the makePlatformBindMounts function. + # That function ends up using ContainerRunDirectory to generate a path like: {prefix}/{container_id}/userdata/hostname + # NOTE: The {prefix} portion of the path can vary, so should not be relied upon. + # See: https://github.com/containers/podman/blob/480c7fbf5361f3bd8c1ed81fe4b9910c5c73b186/libpod/container_internal_linux.go#L660-L664 + # See: https://github.com/containers/podman/blob/480c7fbf5361f3bd8c1ed81fe4b9910c5c73b186/vendor/github.com/containers/storage/store.go#L3133 + # This behavior has existed for ~5 years and was present in Podman version 0.2. + # See: https://github.com/containers/podman/pull/248 + if match := re.search('/(?P[0-9a-f]{64})/userdata/hostname$', str(mount.root)): + container_id = match.group('id') + engine = 'Podman' + break + + # Docker generates /etc/hostname in the BuildHostnameFile function. + # That function ends up using the containerRoot function to generate a path like: {prefix}/{container_id}/hostname + # NOTE: The {prefix} portion of the path can vary, so should not be relied upon. + # See: https://github.com/moby/moby/blob/cd8a090e6755bee0bdd54ac8a894b15881787097/container/container_unix.go#L58 + # See: https://github.com/moby/moby/blob/92e954a2f05998dc05773b6c64bbe23b188cb3a0/daemon/container.go#L86 + # This behavior has existed for at least ~7 years and was present in Docker version 1.0.1. + # See: https://github.com/moby/moby/blob/v1.0.1/daemon/container.go#L351 + # See: https://github.com/moby/moby/blob/v1.0.1/daemon/daemon.go#L133 + if match := re.search('/(?P[0-9a-f]{64})/hostname$', str(mount.root)): + container_id = match.group('id') + engine = 'Docker' + break if container_id: - display.info('Detected execution in Docker container: %s' % container_id, verbosity=1) + display.info(f'Detected execution in {engine} container ID: {container_id}', verbosity=1) return container_id -def get_docker_preferred_network_name(args): # type: (EnvironmentConfig) -> str - """ - Return the preferred network name for use with Docker. The selection logic is: - - the network selected by the user with `--docker-network` - - the network of the currently running docker container (if any) - - the default docker network (returns None) - """ - try: - return get_docker_preferred_network_name.network # type: ignore[attr-defined] - except AttributeError: - pass - - network = None - - if args.docker_network: - network = args.docker_network - else: - current_container_id = get_docker_container_id() - - if current_container_id: - # Make sure any additional containers we launch use the same network as the current container we're running in. - # This is needed when ansible-test is running in a container that is not connected to Docker's default network. - container = docker_inspect(args, current_container_id, always=True) - network = container.get_network_name() - - get_docker_preferred_network_name.network = network # type: ignore[attr-defined] - - return network - - -def is_docker_user_defined_network(network): # type: (str) -> bool - """Return True if the network being used is a user-defined network.""" - return bool(network) and network != 'bridge' - - -def docker_pull(args, image): # type: (EnvironmentConfig, str) -> None +def docker_pull(args, image): # type: (CommonConfig, str) -> None """ Pull the specified image if it is not available. Images without a tag or digest will not be pulled. Retries up to 10 times if the pull fails. + A warning will be shown for any image with volumes defined. + Images will be pulled only once. + Concurrent pulls for the same image will block until the first completes. """ + with named_lock(f'docker_pull:{image}') as first: + if first: + __docker_pull(args, image) + + +def __docker_pull(args: CommonConfig, image: str) -> None: + """Internal implementation for docker_pull. Do not call directly.""" if '@' not in image and ':' not in image: display.info('Skipping pull of image without tag or digest: %s' % image, verbosity=2) - return - - if docker_image_exists(args, image): + inspect = docker_image_inspect(args, image) + elif inspect := docker_image_inspect(args, image, always=True): display.info('Skipping pull of existing image: %s' % image, verbosity=2) - return - - for _iteration in range(1, 10): - try: - docker_command(args, ['pull', image], capture=False) - return - except SubprocessError: - display.warning('Failed to pull docker image "%s". Waiting a few seconds before trying again.' % image) - time.sleep(3) - - raise ApplicationError('Failed to pull docker image "%s".' % image) - - -def docker_cp_to(args, container_id, src, dst): # type: (EnvironmentConfig, str, str, str) -> None - """Copy a file to the specified container.""" - docker_command(args, ['cp', src, '%s:%s' % (container_id, dst)], capture=True) - - -def docker_run( - args, # type: EnvironmentConfig - image, # type: str - options, # type: t.Optional[t.List[str]] - cmd=None, # type: t.Optional[t.List[str]] - create_only=False, # type: bool -): # type: (...) -> str - """Run a container using the given docker image.""" - if not options: - options = [] - - if not cmd: - cmd = [] - - if create_only: - command = 'create' else: - command = 'run' + for _iteration in range(1, 10): + try: + docker_command(args, ['pull', image], capture=False) - network = get_docker_preferred_network_name(args) + if (inspect := docker_image_inspect(args, image)) or args.explain: + break - if is_docker_user_defined_network(network): - # Only when the network is not the default bridge network. - options.extend(['--network', network]) + display.warning(f'Image "{image}" not found after pull completed. Waiting a few seconds before trying again.') + except SubprocessError: + display.warning(f'Failed to pull container image "{image}". Waiting a few seconds before trying again.') + time.sleep(3) + else: + raise ApplicationError(f'Failed to pull container image "{image}".') - options.extend(['--ulimit', 'nofile=%s' % MAX_NUM_OPEN_FILES]) + if inspect and inspect.volumes: + display.warning(f'Image "{image}" contains {len(inspect.volumes)} volume(s): {", ".join(sorted(inspect.volumes))}\n' + 'This may result in leaking anonymous volumes. It may also prevent the image from working on some hosts or container engines.\n' + 'The image should be rebuilt without the use of the VOLUME instruction.', + unique=True) - for _iteration in range(1, 3): - try: - stdout = docker_command(args, [command] + options + [image] + cmd, capture=True)[0] - if args.explain: - return ''.join(random.choice('0123456789abcdef') for _iteration in range(64)) +def docker_cp_to(args, container_id, src, dst): # type: (CommonConfig, str, str, str) -> None + """Copy a file to the specified container.""" + docker_command(args, ['cp', src, '%s:%s' % (container_id, dst)], capture=True) - return stdout.strip() - except SubprocessError as ex: - display.error(ex.message) - display.warning('Failed to run docker image "%s". Waiting a few seconds before trying again.' % image) - time.sleep(3) - raise ApplicationError('Failed to run docker image "%s".' % image) +def docker_create( + args: CommonConfig, + image: str, + options: list[str], + cmd: list[str] = None, +) -> tuple[t.Optional[str], t.Optional[str]]: + """Create a container using the given docker image.""" + return docker_command(args, ['create'] + options + [image] + cmd, capture=True) -def docker_start(args, container_id, options=None): # type: (EnvironmentConfig, str, t.Optional[t.List[str]]) -> t.Tuple[t.Optional[str], t.Optional[str]] - """ - Start a docker container by name or ID - """ - if not options: - options = [] +def docker_run( + args: CommonConfig, + image: str, + options: list[str], + cmd: list[str] = None, + data: t.Optional[str] = None, +) -> tuple[t.Optional[str], t.Optional[str]]: + """Run a container using the given docker image.""" + return docker_command(args, ['run'] + options + [image] + cmd, data=data, capture=True) - for _iteration in range(1, 3): - try: - return docker_command(args, ['start'] + options + [container_id], capture=True) - except SubprocessError as ex: - display.error(ex.message) - display.warning('Failed to start docker container "%s". Waiting a few seconds before trying again.' % container_id) - time.sleep(3) - raise ApplicationError('Failed to run docker container "%s".' % container_id) +def docker_start( + args: CommonConfig, + container_id: str, + options: list[str], +) -> tuple[t.Optional[str], t.Optional[str]]: + """Start a container by name or ID.""" + return docker_command(args, ['start'] + options + [container_id], capture=True) -def docker_rm(args, container_id): # type: (EnvironmentConfig, str) -> None +def docker_rm(args, container_id): # type: (CommonConfig, str) -> None """Remove the specified container.""" try: - docker_command(args, ['rm', '-f', container_id], capture=True) + # Stop the container with SIGKILL immediately, then remove the container. + # Podman supports the `--time` option on `rm`, but only since version 4.0.0. + # Docker does not support the `--time` option on `rm`. + docker_command(args, ['stop', '--time', '0', container_id], capture=True) + docker_command(args, ['rm', container_id], capture=True) except SubprocessError as ex: - if 'no such container' in ex.stderr: - pass # podman does not handle this gracefully, exits 1 - else: + # Both Podman and Docker report an error if the container does not exist. + # The error messages contain the same "no such container" string, differing only in capitalization. + if 'no such container' not in ex.stderr.lower(): raise ex @@ -369,7 +721,7 @@ class ContainerNotFoundError(DockerError): class DockerInspect: """The results of `docker inspect` for a single container.""" - def __init__(self, args, inspection): # type: (EnvironmentConfig, t.Dict[str, t.Any]) -> None + def __init__(self, args, inspection): # type: (CommonConfig, t.Dict[str, t.Any]) -> None self.args = args self.inspection = inspection @@ -412,6 +764,11 @@ class DockerInspect: """Return True if the container is running, otherwise False.""" return self.state['Running'] + @property + def pid(self) -> int: + """Return the PID of the init process.""" + return self.state['Pid'] + @property def env(self): # type: () -> t.List[str] """Return a list of the environment variables used to create the container.""" @@ -451,27 +808,8 @@ class DockerInspect: return networks[0] - def get_ip_address(self): # type: () -> t.Optional[str] - """Return the IP address of the container for the preferred docker network.""" - if self.networks: - network_name = get_docker_preferred_network_name(self.args) - if not network_name: - # Sort networks and use the first available. - # This assumes all containers will have access to the same networks. - network_name = sorted(self.networks.keys()).pop(0) - - ipaddress = self.networks[network_name]['IPAddress'] - else: - ipaddress = self.network_settings['IPAddress'] - - if not ipaddress: - return None - - return ipaddress - - -def docker_inspect(args, identifier, always=False): # type: (EnvironmentConfig, str, bool) -> DockerInspect +def docker_inspect(args, identifier, always=False): # type: (CommonConfig, str, bool) -> DockerInspect """ Return the results of `docker container inspect` for the specified container. Raises a ContainerNotFoundError if the container was not found. @@ -492,23 +830,110 @@ def docker_inspect(args, identifier, always=False): # type: (EnvironmentConfig, raise ContainerNotFoundError(identifier) -def docker_network_disconnect(args, container_id, network): # type: (EnvironmentConfig, str, str) -> None +def docker_network_disconnect(args, container_id, network): # type: (CommonConfig, str, str) -> None """Disconnect the specified docker container from the given network.""" docker_command(args, ['network', 'disconnect', network, container_id], capture=True) -def docker_image_exists(args, image): # type: (EnvironmentConfig, str) -> bool - """Return True if the image exists, otherwise False.""" +class DockerImageInspect: + """The results of `docker image inspect` for a single image.""" + def __init__(self, args: CommonConfig, inspection: dict[str, t.Any]) -> None: + self.args = args + self.inspection = inspection + + # primary properties + + @property + def config(self) -> dict[str, t.Any]: + """Return a dictionary of the image config.""" + return self.inspection['Config'] + + # nested properties + + @property + def volumes(self) -> dict[str, t.Any]: + """Return a dictionary of the image volumes.""" + return self.config.get('Volumes') or {} + + @property + def cmd(self) -> list[str]: + """The command to run when the container starts.""" + return self.config['Cmd'] + + +@mutex +def docker_image_inspect(args: CommonConfig, image: str, always: bool = False) -> t.Optional[DockerImageInspect]: + """ + Return the results of `docker image inspect` for the specified image or None if the image does not exist. + """ + inspect_cache: dict[str, DockerImageInspect] + + try: + inspect_cache = docker_image_inspect.cache # type: ignore[attr-defined] + except AttributeError: + inspect_cache = docker_image_inspect.cache = {} # type: ignore[attr-defined] + + if inspect_result := inspect_cache.get(image): + return inspect_result + + try: + stdout = docker_command(args, ['image', 'inspect', image], capture=True, always=always)[0] + except SubprocessError: + stdout = '[]' + + if args.explain and not always: + items = [] + else: + items = json.loads(stdout) + + if len(items) > 1: + raise ApplicationError(f'Inspection of image "{image}" resulted in {len(items)} items:\n{json.dumps(items, indent=4)}') + + if len(items) == 1: + inspect_result = DockerImageInspect(args, items[0]) + inspect_cache[image] = inspect_result + return inspect_result + + return None + + +class DockerNetworkInspect: + """The results of `docker network inspect` for a single network.""" + def __init__(self, args: CommonConfig, inspection: dict[str, t.Any]) -> None: + self.args = args + self.inspection = inspection + + +def docker_network_inspect(args: CommonConfig, network: str, always: bool = False) -> t.Optional[DockerNetworkInspect]: + """ + Return the results of `docker network inspect` for the specified network or None if the network does not exist. + """ try: - docker_command(args, ['image', 'inspect', image], capture=True) + stdout = docker_command(args, ['network', 'inspect', network], capture=True, always=always)[0] except SubprocessError: - return False + stdout = '[]' + + if args.explain and not always: + items = [] + else: + items = json.loads(stdout) + + if len(items) == 1: + return DockerNetworkInspect(args, items[0]) + + return None - return True + +def docker_logs(args: CommonConfig, container_id: str) -> None: + """Display logs for the specified container. If an error occurs, it is displayed rather than raising an exception.""" + try: + docker_command(args, ['logs', container_id], capture=False) + except SubprocessError as ex: + display.error(str(ex)) def docker_exec( - args, # type: EnvironmentConfig + args, # type: CommonConfig container_id, # type: str cmd, # type: t.List[str] capture, # type: bool @@ -530,18 +955,6 @@ def docker_exec( output_stream=output_stream, data=data) -def docker_info(args): # type: (CommonConfig) -> t.Dict[str, t.Any] - """Return a dictionary containing details from the `docker info` command.""" - stdout, _dummy = docker_command(args, ['info', '--format', '{{json .}}'], capture=True, always=True) - return json.loads(stdout) - - -def docker_version(args): # type: (CommonConfig) -> t.Dict[str, t.Any] - """Return a dictionary containing details from the `docker version` command.""" - stdout, _dummy = docker_command(args, ['version', '--format', '{{json .}}'], capture=True, always=True) - return json.loads(stdout) - - def docker_command( args, # type: CommonConfig cmd, # type: t.List[str] @@ -557,7 +970,7 @@ def docker_command( env = docker_environment() command = [require_docker().command] - if command[0] == 'podman' and _get_podman_remote(): + if command[0] == 'podman' and get_podman_remote(): command.append('--remote') return run_command(args, command + cmd, env=env, capture=capture, stdin=stdin, stdout=stdout, interactive=interactive, always=always, @@ -567,5 +980,16 @@ def docker_command( def docker_environment(): # type: () -> t.Dict[str, str] """Return a dictionary of docker related environment variables found in the current environment.""" env = common_environment() - env.update(dict((key, os.environ[key]) for key in os.environ if key.startswith('DOCKER_') or key.startswith('CONTAINER_'))) + + var_names = { + 'XDG_RUNTIME_DIR', # podman + } + + var_prefixes = { + 'CONTAINER_', # podman remote + 'DOCKER_', # docker + } + + env.update({name: value for name, value in os.environ.items() if name in var_names or any(name.startswith(prefix) for prefix in var_prefixes)}) + return env diff --git a/test/lib/ansible_test/_internal/host_configs.py b/test/lib/ansible_test/_internal/host_configs.py index 11a4506465..e69e706542 100644 --- a/test/lib/ansible_test/_internal/host_configs.py +++ b/test/lib/ansible_test/_internal/host_configs.py @@ -18,6 +18,8 @@ from .io import ( ) from .completion import ( + AuditMode, + CGroupVersion, CompletionConfig, docker_completion, DockerCompletionConfig, @@ -282,6 +284,8 @@ class DockerConfig(ControllerHostConfig, PosixConfig): memory: t.Optional[int] = None privileged: t.Optional[bool] = None seccomp: t.Optional[str] = None + cgroup: t.Optional[CGroupVersion] = None + audit: t.Optional[AuditMode] = None def get_defaults(self, context): # type: (HostContext) -> DockerCompletionConfig """Return the default settings.""" @@ -313,6 +317,12 @@ class DockerConfig(ControllerHostConfig, PosixConfig): if self.seccomp is None: self.seccomp = defaults.seccomp + if self.cgroup is None: + self.cgroup = defaults.cgroup_enum + + if self.audit is None: + self.audit = defaults.audit_enum + if self.privileged is None: self.privileged = False @@ -333,6 +343,8 @@ class DockerConfig(ControllerHostConfig, PosixConfig): @dataclasses.dataclass class PosixRemoteConfig(RemoteConfig, ControllerHostConfig, PosixConfig): """Configuration for a POSIX remote host.""" + become: t.Optional[str] = None + def get_defaults(self, context): # type: (HostContext) -> PosixRemoteCompletionConfig """Return the default settings.""" return filter_completion(remote_completion()).get(self.name) or remote_completion().get(self.platform) or PosixRemoteCompletionConfig( @@ -350,6 +362,14 @@ class PosixRemoteConfig(RemoteConfig, ControllerHostConfig, PosixConfig): return [ControllerConfig(python=NativePythonConfig(version=version, path=path)) for version, path in pythons.items()] + def apply_defaults(self, context, defaults): # type: (HostContext, CompletionConfig) -> None + """Apply default settings.""" + assert isinstance(defaults, PosixRemoteCompletionConfig) + + super().apply_defaults(context, defaults) + + self.become = self.become or defaults.become + @property def have_root(self): # type: () -> bool """True if root is available, otherwise False.""" diff --git a/test/lib/ansible_test/_internal/host_profiles.py b/test/lib/ansible_test/_internal/host_profiles.py index b15d7d356e..d1d93a5231 100644 --- a/test/lib/ansible_test/_internal/host_profiles.py +++ b/test/lib/ansible_test/_internal/host_profiles.py @@ -4,11 +4,13 @@ from __future__ import annotations import abc import dataclasses import os +import shlex import tempfile import time import typing as t from .io import ( + read_text_file, write_text_file, ) @@ -52,16 +54,28 @@ from .util import ( sanitize_host_name, sorted_versions, InternalError, + HostConnectionError, + ANSIBLE_TEST_TARGET_ROOT, ) from .util_common import ( + get_docs_url, intercept_python, ) from .docker_util import ( docker_exec, + docker_image_inspect, + docker_logs, + docker_pull, docker_rm, get_docker_hostname, + require_docker, + get_docker_info, + detect_host_properties, + run_utility_container, + SystemdControlGroupV1Status, + LOGINUID_NOT_SET, ) from .bootstrap import ( @@ -99,16 +113,70 @@ from .connections import ( from .become import ( Become, - Su, + SUPPORTED_BECOME_METHODS, Sudo, ) +from .completion import ( + AuditMode, + CGroupVersion, +) + +from .dev.container_probe import ( + CGroupMount, + CGroupPath, + CGroupState, + MountType, + check_container_cgroup_status, +) + TControllerHostConfig = t.TypeVar('TControllerHostConfig', bound=ControllerHostConfig) THostConfig = t.TypeVar('THostConfig', bound=HostConfig) TPosixConfig = t.TypeVar('TPosixConfig', bound=PosixConfig) TRemoteConfig = t.TypeVar('TRemoteConfig', bound=RemoteConfig) +class ControlGroupError(ApplicationError): + """Raised when the container host does not have the necessary cgroup support to run a container.""" + def __init__(self, args: CommonConfig, reason: str) -> None: + engine = require_docker().command + dd_wsl2 = get_docker_info(args).docker_desktop_wsl2 + + message = f''' +{reason} + +Run the following commands as root on the container host to resolve this issue: + + mkdir /sys/fs/cgroup/systemd + mount cgroup -t cgroup /sys/fs/cgroup/systemd -o none,name=systemd,xattr + chown -R {{user}}:{{group}} /sys/fs/cgroup/systemd # only when rootless + +NOTE: These changes must be applied each time the container host is rebooted. +'''.strip() + + podman_message = ''' + If rootless Podman is already running [1], you may need to stop it before + containers are able to use the new mount point. + +[1] Check for 'podman' and 'catatonit' processes. +''' + + dd_wsl_message = f''' + When using Docker Desktop with WSL2, additional configuration [1] is required. + +[1] {get_docs_url("https://docs.ansible.com/ansible-core/devel/dev_guide/testing_running_locally.html#docker-desktop-with-wsl2")} +''' + + if engine == 'podman': + message += podman_message + elif dd_wsl2: + message += dd_wsl_message + + message = message.strip() + + super().__init__(message) + + @dataclasses.dataclass(frozen=True) class Inventory: """Simple representation of an Ansible inventory.""" @@ -179,6 +247,9 @@ class HostProfile(t.Generic[THostConfig], metaclass=abc.ABCMeta): def setup(self): # type: () -> None """Perform out-of-band setup before delegation.""" + def on_target_failure(self) -> None: + """Executed during failure handling if this profile is a target.""" + def deprovision(self): # type: () -> None """Deprovision the host after delegation has completed.""" @@ -331,6 +402,16 @@ class ControllerProfile(SshTargetHostProfile[ControllerConfig], PosixProfile[Con class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[DockerConfig]): """Host profile for a docker instance.""" + + MARKER = 'ansible-test-marker' + + @dataclasses.dataclass(frozen=True) + class InitConfig: + """Configuration details required to run the container init.""" + options: list[str] + command: str + expected_mounts: tuple[CGroupMount, ...] + @property def container_name(self): # type: () -> t.Optional[str] """Return the stored container name, if any, otherwise None.""" @@ -341,17 +422,36 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do """Store the given container name.""" self.state['container_name'] = value + @property + def cgroup_path(self) -> t.Optional[str]: + """Return the path to the cgroup v1 systemd hierarchy, if any, otherwise None.""" + return self.state.get('cgroup_path') + + @cgroup_path.setter + def cgroup_path(self, value: str) -> None: + """Store the path to the cgroup v1 systemd hierarchy.""" + self.state['cgroup_path'] = value + + @property + def label(self) -> str: + """Label to apply to resources related to this profile.""" + return f'{"controller" if self.controller else "target"}-{self.args.session_name}' + def provision(self): # type: () -> None """Provision the host before delegation.""" + init_probe = self.args.dev_probe_cgroups is not None + init_config = self.get_init_config() + container = run_support_container( args=self.args, context='__test_hosts__', image=self.config.image, - name=f'ansible-test-{"controller" if self.controller else "target"}-{self.args.session_name}', + name=f'ansible-test-{self.label}', ports=[22], publish_ports=not self.controller, # connections to the controller over SSH are not required - options=self.get_docker_run_options(), + options=init_config.options, cleanup=CleanupMode.NO, + cmd=self.build_sleep_command() if init_config.command or init_probe else None, ) if not container: @@ -359,6 +459,458 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do self.container_name = container.name + try: + options = ['--pid', 'host', '--privileged'] + + if init_config.command: + init_command = init_config.command + + if not init_probe: + init_command += f' && {shlex.join(self.wake_command)}' + + cmd = ['nsenter', '-t', str(container.details.container.pid), '-m', '-p', 'sh', '-c', init_command] + run_utility_container(self.args, f'ansible-test-init-{self.label}', cmd, options) + + if init_probe: + check_container_cgroup_status(self.args, self.config, self.container_name, init_config.expected_mounts) + + cmd = ['nsenter', '-t', str(container.details.container.pid), '-m', '-p'] + self.wake_command + run_utility_container(self.args, f'ansible-test-wake-{self.label}', cmd, options) + except SubprocessError: + display.info(f'Checking container "{self.container_name}" logs...') + docker_logs(self.args, self.container_name) + + raise + + def get_init_config(self) -> InitConfig: + """Return init config for running under the current container engine.""" + self.check_cgroup_requirements() + + engine = require_docker().command + init_config = getattr(self, f'get_{engine}_init_config')() + + return init_config + + def get_podman_init_config(self) -> InitConfig: + """Return init config for running under Podman.""" + options = self.get_common_run_options() + command: t.Optional[str] = None + expected_mounts: tuple[CGroupMount, ...] + + cgroup_version = get_docker_info(self.args).cgroup_version + + # Without AUDIT_WRITE the following errors may appear in the system logs of a container after attempting to log in using SSH: + # + # fatal: linux_audit_write_entry failed: Operation not permitted + # + # This occurs when running containers as root when the container host provides audit support, but the user lacks the AUDIT_WRITE capability. + # The AUDIT_WRITE capability is provided by docker by default, but not podman. + # See: https://github.com/moby/moby/pull/7179 + # + # OpenSSH Portable requires AUDIT_WRITE when logging in with a TTY if the Linux audit feature was compiled in. + # Containers with the feature enabled will require the AUDIT_WRITE capability when EPERM is returned while accessing the audit system. + # See: https://github.com/openssh/openssh-portable/blob/2dc328023f60212cd29504fc05d849133ae47355/audit-linux.c#L90 + # See: https://github.com/openssh/openssh-portable/blob/715c892f0a5295b391ae92c26ef4d6a86ea96e8e/loginrec.c#L476-L478 + # + # Some containers will be running a patched version of OpenSSH which blocks logins when EPERM is received while using the audit system. + # These containers will require the AUDIT_WRITE capability when EPERM is returned while accessing the audit system. + # See: https://src.fedoraproject.org/rpms/openssh/blob/f36/f/openssh-7.6p1-audit.patch + # + # Since only some containers carry the patch or enable the Linux audit feature in OpenSSH, this capability is enabled on a per-container basis. + # No warning is provided when adding this capability, since there's not really anything the user can do about it. + if self.config.audit == AuditMode.REQUIRED and detect_host_properties(self.args).audit_code == 'EPERM': + options.extend(('--cap-add', 'AUDIT_WRITE')) + + # Without AUDIT_CONTROL the following errors may appear in the system logs of a container after attempting to log in using SSH: + # + # pam_loginuid(sshd:session): Error writing /proc/self/loginuid: Operation not permitted + # pam_loginuid(sshd:session): set_loginuid failed + # + # Containers configured to use the pam_loginuid module will encounter this error. If the module is required, logins will fail. + # Since most containers will have this configuration, the code to handle this issue is applied to all containers. + # + # This occurs when the loginuid is set on the container host and doesn't match the user on the container host which is running the container. + # Container hosts which do not use systemd are likely to leave the loginuid unset and thus be unaffected. + # The most common source of a mismatch is the use of sudo to run ansible-test, which changes the uid but cannot change the loginuid. + # This condition typically occurs only under podman, since the loginuid is inherited from the current user. + # See: https://github.com/containers/podman/issues/13012#issuecomment-1034049725 + # + # This condition is detected by querying the loginuid of a container running on the container host. + # When it occurs, a warning is displayed and the AUDIT_CONTROL capability is added to containers to work around the issue. + # The warning serves as notice to the user that their usage of ansible-test is responsible for the additional capability requirement. + if (loginuid := detect_host_properties(self.args).loginuid) not in (0, LOGINUID_NOT_SET, None): + display.warning(f'Running containers with capability AUDIT_CONTROL since the container loginuid ({loginuid}) is incorrect. ' + 'This is most likely due to use of sudo to run ansible-test when loginuid is already set.', unique=True) + + options.extend(('--cap-add', 'AUDIT_CONTROL')) + + if self.config.cgroup == CGroupVersion.NONE: + # Containers which do not require cgroup do not use systemd. + + options.extend(( + # Disabling systemd support in Podman will allow these containers to work on hosts without systemd. + # Without this, running a container on a host without systemd results in errors such as (from crun): + # Error: crun: error stat'ing file `/sys/fs/cgroup/systemd`: No such file or directory: + # A similar error occurs when using runc: + # OCI runtime attempted to invoke a command that was not found + '--systemd', 'false', + # A private cgroup namespace limits what is visible in /proc/*/cgroup. + '--cgroupns', 'private', + # Mounting a tmpfs overrides the cgroup mount(s) that would otherwise be provided by Podman. + # This helps provide a consistent container environment across various container host configurations. + '--tmpfs', '/sys/fs/cgroup', + )) + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.TMPFS, writable=True, state=None), + ) + elif self.config.cgroup in (CGroupVersion.V1_V2, CGroupVersion.V1_ONLY) and cgroup_version == 1: + # Podman hosts providing cgroup v1 will automatically bind mount the systemd hierarchy read-write in the container. + # They will also create a dedicated cgroup v1 systemd hierarchy for the container. + # On hosts with systemd this path is: /sys/fs/cgroup/systemd/libpod_parent/libpod-{container_id}/ + # On hosts without systemd this path is: /sys/fs/cgroup/systemd/{container_id}/ + + options.extend(( + # Force Podman to enable systemd support since a command may be used later (to support pre-init diagnostics). + '--systemd', 'always', + # The host namespace must be used to permit the container to access the cgroup v1 systemd hierarchy created by Podman. + '--cgroupns', 'host', + # Mask the host cgroup tmpfs mount to avoid exposing the host cgroup v1 hierarchies (or cgroup v2 hybrid) to the container. + # Podman will provide a cgroup v1 systemd hiearchy on top of this. + '--tmpfs', '/sys/fs/cgroup', + )) + + self.check_systemd_cgroup_v1(options) # podman + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.TMPFS, writable=True, state=None), + # The mount point can be writable or not. + # The reason for the variation is not known. + CGroupMount(path=CGroupPath.SYSTEMD, type=MountType.CGROUP_V1, writable=None, state=CGroupState.HOST), + # The filesystem type can be tmpfs or devtmpfs. + # The reason for the variation is not known. + CGroupMount(path=CGroupPath.SYSTEMD_RELEASE_AGENT, type=None, writable=False, state=None), + ) + elif self.config.cgroup in (CGroupVersion.V1_V2, CGroupVersion.V2_ONLY) and cgroup_version == 2: + # Podman hosts providing cgroup v2 will give each container a read-write cgroup mount. + + options.extend(( + # Force Podman to enable systemd support since a command may be used later (to support pre-init diagnostics). + '--systemd', 'always', + # A private cgroup namespace is used to avoid exposing the host cgroup to the container. + '--cgroupns', 'private', + )) + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.CGROUP_V2, writable=True, state=CGroupState.PRIVATE), + ) + elif self.config.cgroup == CGroupVersion.V1_ONLY and cgroup_version == 2: + # Containers which require cgroup v1 need explicit volume mounts on container hosts not providing that version. + # We must put the container PID 1 into the cgroup v1 systemd hierarchy we create. + cgroup_path = self.create_systemd_cgroup_v1() # podman + command = f'echo 1 > {cgroup_path}/cgroup.procs' + + options.extend(( + # Force Podman to enable systemd support since a command is being provided. + '--systemd', 'always', + # A private cgroup namespace is required. Using the host cgroup namespace results in errors such as the following (from crun): + # Error: OCI runtime error: mount `/sys/fs/cgroup` to '/sys/fs/cgroup': Invalid argument + # A similar error occurs when using runc: + # Error: OCI runtime error: runc create failed: unable to start container process: error during container init: + # error mounting "/sys/fs/cgroup" to rootfs at "/sys/fs/cgroup": mount /sys/fs/cgroup:/sys/fs/cgroup (via /proc/self/fd/7), flags: 0x1000: + # invalid argument + '--cgroupns', 'private', + # Unlike Docker, Podman ignores a /sys/fs/cgroup tmpfs mount, instead exposing a cgroup v2 mount. + # The exposed volume will be read-write, but the container will have its own private namespace. + # Provide a read-only cgroup v1 systemd hierarchy under which the dedicated ansible-test cgroup will be mounted read-write. + # Without this systemd will fail while attempting to mount the cgroup v1 systemd hierarchy. + # Podman doesn't support using a tmpfs for this. Attempting to do so results in an error (from crun): + # Error: OCI runtime error: read: Invalid argument + # A similar error occurs when using runc: + # Error: OCI runtime error: runc create failed: unable to start container process: error during container init: + # error mounting "tmpfs" to rootfs at "/sys/fs/cgroup/systemd": tmpcopyup: failed to copy /sys/fs/cgroup/systemd to /proc/self/fd/7 + # (/tmp/runctop3876247619/runctmpdir1460907418): read /proc/self/fd/7/cgroup.kill: invalid argument + '--volume', '/sys/fs/cgroup/systemd:/sys/fs/cgroup/systemd:ro', + # Provide the container access to the cgroup v1 systemd hierarchy created by ansible-test. + '--volume', f'{cgroup_path}:{cgroup_path}:rw', + )) + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.CGROUP_V2, writable=True, state=CGroupState.PRIVATE), + CGroupMount(path=CGroupPath.SYSTEMD, type=MountType.CGROUP_V1, writable=False, state=CGroupState.SHADOWED), + CGroupMount(path=cgroup_path, type=MountType.CGROUP_V1, writable=True, state=CGroupState.HOST), + ) + else: + raise InternalError(f'Unhandled cgroup configuration: {self.config.cgroup} on cgroup v{cgroup_version}.') + + return self.InitConfig( + options=options, + command=command, + expected_mounts=expected_mounts, + ) + + def get_docker_init_config(self) -> InitConfig: + """Return init config for running under Docker.""" + options = self.get_common_run_options() + command: t.Optional[str] = None + expected_mounts: tuple[CGroupMount, ...] + + cgroup_version = get_docker_info(self.args).cgroup_version + + if self.config.cgroup == CGroupVersion.NONE: + # Containers which do not require cgroup do not use systemd. + + if get_docker_info(self.args).cgroupns_option_supported: + # Use the `--cgroupns` option if it is supported. + # Older servers which do not support the option use the host group namespace. + # Older clients which do not support the option cause newer servers to use the host cgroup namespace (cgroup v1 only). + # See: https://github.com/moby/moby/blob/master/api/server/router/container/container_routes.go#L512-L517 + # If the host cgroup namespace is used, cgroup information will be visible, but the cgroup mounts will be unavailable due to the tmpfs below. + options.extend(( + # A private cgroup namespace limits what is visible in /proc/*/cgroup. + '--cgroupns', 'private', + )) + + options.extend(( + # Mounting a tmpfs overrides the cgroup mount(s) that would otherwise be provided by Docker. + # This helps provide a consistent container environment across various container host configurations. + '--tmpfs', '/sys/fs/cgroup', + )) + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.TMPFS, writable=True, state=None), + ) + elif self.config.cgroup in (CGroupVersion.V1_V2, CGroupVersion.V1_ONLY) and cgroup_version == 1: + # Docker hosts providing cgroup v1 will automatically bind mount the systemd hierarchy read-only in the container. + # They will also create a dedicated cgroup v1 systemd hierarchy for the container. + # The cgroup v1 system hierarchy path is: /sys/fs/cgroup/systemd/{container_id}/ + + if get_docker_info(self.args).cgroupns_option_supported: + # Use the `--cgroupns` option if it is supported. + # Older servers which do not support the option use the host group namespace. + # Older clients which do not support the option cause newer servers to use the host cgroup namespace (cgroup v1 only). + # See: https://github.com/moby/moby/blob/master/api/server/router/container/container_routes.go#L512-L517 + options.extend(( + # The host cgroup namespace must be used. + # Otherwise, /proc/1/cgroup will report "/" for the cgroup path, which is incorrect. + # See: https://github.com/systemd/systemd/issues/19245#issuecomment-815954506 + # It is set here to avoid relying on the current Docker configuration. + '--cgroupns', 'host', + )) + + options.extend(( + # Mask the host cgroup tmpfs mount to avoid exposing the host cgroup v1 hierarchies (or cgroup v2 hybrid) to the container. + '--tmpfs', '/sys/fs/cgroup', + # A cgroup v1 systemd hierarchy needs to be mounted read-write over the read-only one provided by Docker. + # Alternatives were tested, but were unusable due to various issues: + # - Attempting to remount the existing mount point read-write will result in a "mount point is busy" error. + # - Adding the entire "/sys/fs/cgroup" mount will expose hierarchies other than systemd. + # If the host is a cgroup v2 hybrid host it would also expose the /sys/fs/cgroup/unified/ hierarchy read-write. + # On older systems, such as an Ubuntu 18.04 host, a dedicated v2 cgroup would not be used, exposing the host cgroups to the container. + '--volume', '/sys/fs/cgroup/systemd:/sys/fs/cgroup/systemd:rw', + )) + + self.check_systemd_cgroup_v1(options) # docker + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.TMPFS, writable=True, state=None), + CGroupMount(path=CGroupPath.SYSTEMD, type=MountType.CGROUP_V1, writable=True, state=CGroupState.HOST), + ) + elif self.config.cgroup in (CGroupVersion.V1_V2, CGroupVersion.V2_ONLY) and cgroup_version == 2: + # Docker hosts providing cgroup v2 will give each container a read-only cgroup mount. + # It must be remounted read-write before systemd starts. + command = 'mount -o remount,rw /sys/fs/cgroup/' + + options.extend(( + # A private cgroup namespace is used to avoid exposing the host cgroup to the container. + # This matches the behavior in Podman 1.7.0 and later, which select cgroupns 'host' mode for cgroup v1 and 'private' mode for cgroup v2. + # See: https://github.com/containers/podman/pull/4374 + # See: https://github.com/containers/podman/blob/main/RELEASE_NOTES.md#170 + '--cgroupns', 'private', + )) + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.CGROUP_V2, writable=True, state=CGroupState.PRIVATE), + ) + elif self.config.cgroup == CGroupVersion.V1_ONLY and cgroup_version == 2: + # Containers which require cgroup v1 need explicit volume mounts on container hosts not providing that version. + # We must put the container PID 1 into the cgroup v1 systemd hierarchy we create. + cgroup_path = self.create_systemd_cgroup_v1() # docker + command = f'echo 1 > {cgroup_path}/cgroup.procs' + + options.extend(( + # A private cgroup namespace is used since no access to the host cgroup namespace is required. + # This matches the configuration used for running cgroup v1 containers under Podman. + '--cgroupns', 'private', + # Provide a read-write tmpfs filesystem to support additional cgroup mount points. + # Without this Docker will provide a read-only cgroup2 mount instead. + '--tmpfs', '/sys/fs/cgroup', + # Provide a read-write tmpfs filesystem to simulate a systemd cgroup v1 hierarchy. + # Without this systemd will fail while attempting to mount the cgroup v1 systemd hierarchy. + '--tmpfs', '/sys/fs/cgroup/systemd', + # Provide the container access to the cgroup v1 systemd hierarchy created by ansible-test. + '--volume', f'{cgroup_path}:{cgroup_path}:rw', + )) + + expected_mounts = ( + CGroupMount(path=CGroupPath.ROOT, type=MountType.TMPFS, writable=True, state=None), + CGroupMount(path=CGroupPath.SYSTEMD, type=MountType.TMPFS, writable=True, state=None), + CGroupMount(path=cgroup_path, type=MountType.CGROUP_V1, writable=True, state=CGroupState.HOST), + ) + else: + raise InternalError(f'Unhandled cgroup configuration: {self.config.cgroup} on cgroup v{cgroup_version}.') + + return self.InitConfig( + options=options, + command=command, + expected_mounts=expected_mounts, + ) + + def build_sleep_command(self) -> list[str]: + """ + Build and return the command to put the container to sleep. + + The sleep duration below was selected to: + + - Allow enough time to perform necessary operations in the container before waking it. + - Make the delay obvious if the wake command doesn't run or succeed. + - Avoid hanging indefinitely or for an unreasonably long time. + + NOTE: The container must have a POSIX-compliant default shell "sh" with a non-builtin "sleep" command. + """ + docker_pull(self.args, self.config.image) + inspect = docker_image_inspect(self.args, self.config.image) + + return ['sh', '-c', f'sleep 60; exec {shlex.join(inspect.cmd)}'] + + @property + def wake_command(self) -> list[str]: + """ + The command used to wake the container from sleep. + This will be run inside our utility container, so the command used does not need to be present in the container being woken up. + """ + return ['pkill', 'sleep'] + + def check_systemd_cgroup_v1(self, options: list[str]) -> None: + """Check the cgroup v1 systemd hierarchy to verify it is writeable for our container.""" + probe_script = (read_text_file(os.path.join(ANSIBLE_TEST_TARGET_ROOT, 'setup', 'check_systemd_cgroup_v1.sh')) + .replace('@MARKER@', self.MARKER) + .replace('@LABEL@', self.label)) + + cmd = ['sh'] + + try: + run_utility_container(self.args, f'ansible-test-cgroup-check-{self.label}', cmd, options, data=probe_script) + except SubprocessError as ex: + if error := self.extract_error(ex.stderr): + raise ControlGroupError(self.args, 'Unable to create a v1 cgroup within the systemd hierarchy.\n' + f'Reason: {error}') from ex # cgroup probe failed + + raise + + def create_systemd_cgroup_v1(self) -> str: + """Create a unique ansible-test cgroup in the v1 systemd hierarchy and return its path.""" + self.cgroup_path = f'/sys/fs/cgroup/systemd/ansible-test-{self.label}' + + # Privileged mode is required to create the cgroup directories on some hosts, such as Fedora 36 and RHEL 9.0. + # The mkdir command will fail with "Permission denied" otherwise. + options = ['--volume', '/sys/fs/cgroup/systemd:/sys/fs/cgroup/systemd:rw', '--privileged'] + cmd = ['sh', '-c', f'>&2 echo {shlex.quote(self.MARKER)} && mkdir {shlex.quote(self.cgroup_path)}'] + + try: + run_utility_container(self.args, f'ansible-test-cgroup-create-{self.label}', cmd, options) + except SubprocessError as ex: + if error := self.extract_error(ex.stderr): + raise ControlGroupError(self.args, f'Unable to create a v1 cgroup within the systemd hierarchy.\n' + f'Reason: {error}') from ex # cgroup create permission denied + + raise + + return self.cgroup_path + + @property + def delete_systemd_cgroup_v1_command(self) -> list[str]: + """The command used to remove the previously created ansible-test cgroup in the v1 systemd hierarchy.""" + return ['find', self.cgroup_path, '-type', 'd', '-delete'] + + def delete_systemd_cgroup_v1(self) -> None: + """Delete a previously created ansible-test cgroup in the v1 systemd hierarchy.""" + # Privileged mode is required to remove the cgroup directories on some hosts, such as Fedora 36 and RHEL 9.0. + # The BusyBox find utility will report "Permission denied" otherwise, although it still exits with a status code of 0. + options = ['--volume', '/sys/fs/cgroup/systemd:/sys/fs/cgroup/systemd:rw', '--privileged'] + cmd = ['sh', '-c', f'>&2 echo {shlex.quote(self.MARKER)} && {shlex.join(self.delete_systemd_cgroup_v1_command)}'] + + try: + run_utility_container(self.args, f'ansible-test-cgroup-delete-{self.label}', cmd, options) + except SubprocessError as ex: + if error := self.extract_error(ex.stderr): + if error.endswith(': No such file or directory'): + return + + display.error(str(ex)) + + def extract_error(self, value: str) -> t.Optional[str]: + """ + Extract the ansible-test portion of the error message from the given value and return it. + Returns None if no ansible-test marker was found. + """ + lines = value.strip().splitlines() + + try: + idx = lines.index(self.MARKER) + except ValueError: + return None + + lines = lines[idx + 1:] + message = '\n'.join(lines) + + return message + + def check_cgroup_requirements(self): + """Check cgroup requirements for the container.""" + cgroup_version = get_docker_info(self.args).cgroup_version + + if cgroup_version not in (1, 2): + raise ApplicationError(f'The container host provides cgroup v{cgroup_version}, but only version v1 and v2 are supported.') + + # Stop early for containers which require cgroup v2 when the container host does not provide it. + # None of the containers included with ansible-test currently use this configuration. + # Support for v2-only was added in preparation for the eventual removal of cgroup v1 support from systemd after EOY 2023. + # See: https://github.com/systemd/systemd/pull/24086 + if self.config.cgroup == CGroupVersion.V2_ONLY and cgroup_version != 2: + raise ApplicationError(f'Container {self.config.name} requires cgroup v2 but the container host provides cgroup v{cgroup_version}.') + + # Containers which use old versions of systemd (earlier than version 226) require cgroup v1 support. + # If the host is a cgroup v2 (unified) host, changes must be made to how the container is run. + # + # See: https://github.com/systemd/systemd/blob/main/NEWS + # Under the "CHANGES WITH 226" section: + # > systemd now optionally supports the new Linux kernel "unified" control group hierarchy. + # + # NOTE: The container host must have the cgroup v1 mount already present. + # If the container is run rootless, the user it runs under must have permissions to the mount. + # + # The following commands can be used to make the mount available: + # + # mkdir /sys/fs/cgroup/systemd + # mount cgroup -t cgroup /sys/fs/cgroup/systemd -o none,name=systemd,xattr + # chown -R {user}:{group} /sys/fs/cgroup/systemd # only when rootless + # + # See: https://github.com/containers/crun/blob/main/crun.1.md#runocisystemdforce_cgroup_v1path + if self.config.cgroup == CGroupVersion.V1_ONLY or (self.config.cgroup != CGroupVersion.NONE and get_docker_info(self.args).cgroup_version == 1): + if (cgroup_v1 := detect_host_properties(self.args).cgroup_v1) != SystemdControlGroupV1Status.VALID: + if self.config.cgroup == CGroupVersion.V1_ONLY: + if get_docker_info(self.args).cgroup_version == 2: + reason = f'Container {self.config.name} requires cgroup v1, but the container host only provides cgroup v2.' + else: + reason = f'Container {self.config.name} requires cgroup v1, but the container host does not appear to be running systemd.' + else: + reason = 'The container host provides cgroup v1, but does not appear to be running systemd.' + + reason += f'\n{cgroup_v1.value}' + + raise ControlGroupError(self.args, reason) # cgroup probe reported invalid state + def setup(self): # type: () -> None """Perform out-of-band setup before delegation.""" bootstrapper = BootstrapDocker( @@ -370,32 +922,62 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do setup_sh = bootstrapper.get_script() shell = setup_sh.splitlines()[0][2:] - docker_exec(self.args, self.container_name, [shell], data=setup_sh, capture=False) + try: + docker_exec(self.args, self.container_name, [shell], data=setup_sh, capture=False) + except SubprocessError: + display.info(f'Checking container "{self.container_name}" logs...') + docker_logs(self.args, self.container_name) + raise def deprovision(self): # type: () -> None """Deprovision the host after delegation has completed.""" - if not self.container_name: - return # provision was never called or did not succeed, so there is no container to remove - - if self.args.docker_terminate == TerminateMode.ALWAYS or (self.args.docker_terminate == TerminateMode.SUCCESS and self.args.success): - docker_rm(self.args, self.container_name) + container_exists = False + + if self.container_name: + if self.args.docker_terminate == TerminateMode.ALWAYS or (self.args.docker_terminate == TerminateMode.SUCCESS and self.args.success): + docker_rm(self.args, self.container_name) + else: + container_exists = True + + if self.cgroup_path: + if container_exists: + display.notice(f'Remember to run `{require_docker().command} rm -f {self.container_name}` when finished testing. ' + f'Then run `{shlex.join(self.delete_systemd_cgroup_v1_command)}` on the container host.') + else: + self.delete_systemd_cgroup_v1() + elif container_exists: + display.notice(f'Remember to run `{require_docker().command} rm -f {self.container_name}` when finished testing.') def wait(self): # type: () -> None """Wait for the instance to be ready. Executed before delegation for the controller and after delegation for targets.""" if not self.controller: con = self.get_controller_target_connections()[0] + last_error = '' - for dummy in range(1, 60): + for dummy in range(1, 10): try: con.run(['id'], capture=True) except SubprocessError as ex: if 'Permission denied' in ex.message: raise + last_error = str(ex) time.sleep(1) else: return + display.info('Checking SSH debug output...') + display.info(last_error) + + if not self.args.delegate and not self.args.host_path: + def callback() -> None: + """Callback to run during error display.""" + self.on_target_failure() # when the controller is not delegated, report failures immediately + else: + callback = None + + raise HostConnectionError(f'Timeout waiting for {self.config.name} container {self.container_name}.', callback) + def get_controller_target_connections(self): # type: () -> t.List[SshConnection] """Return SSH connection(s) for accessing the host as a target from the controller.""" containers = get_container_database(self.args) @@ -423,13 +1005,46 @@ class DockerProfile(ControllerHostProfile[DockerConfig], SshTargetHostProfile[Do """Return the working directory for the host.""" return '/root' - def get_docker_run_options(self): # type: () -> t.List[str] + def on_target_failure(self) -> None: + """Executed during failure handling if this profile is a target.""" + display.info(f'Checking container "{self.container_name}" logs...') + + try: + docker_logs(self.args, self.container_name) + except SubprocessError as ex: + display.error(str(ex)) + + if self.config.cgroup != CGroupVersion.NONE: + # Containers with cgroup support are assumed to be running systemd. + display.info(f'Checking container "{self.container_name}" systemd logs...') + + try: + docker_exec(self.args, self.container_name, ['journalctl'], capture=False) + except SubprocessError as ex: + display.error(str(ex)) + + display.error(f'Connection to container "{self.container_name}" failed. See logs and original error above.') + + def get_common_run_options(self) -> list[str]: """Return a list of options needed to run the container.""" options = [ - '--volume', '/sys/fs/cgroup:/sys/fs/cgroup:ro', - f'--privileged={str(self.config.privileged).lower()}', + # These temporary mount points need to be created at run time when using Docker. + # They are automatically provided by Podman, but will be overridden by VOLUME instructions for the container, if they exist. + # If supporting containers with VOLUME instructions is not desired, these options could be limited to use with Docker. + # See: https://github.com/containers/podman/pull/1318 + # Previously they were handled by the VOLUME instruction during container image creation. + # However, that approach creates anonymous volumes when running the container, which are then left behind after the container is deleted. + # These options eliminate the need for the VOLUME instruction, and override it if they are present. + # The mount options used are those typically found on Linux systems. + # Of special note is the "exec" option for "/tmp", which is required by ansible-test for path injection of executables using temporary directories. + '--tmpfs', '/tmp:exec', + '--tmpfs', '/run:exec', + '--tmpfs', '/run/lock', # some systemd containers require a separate tmpfs here, such as Ubuntu 20.04 and Ubuntu 22.04 ] + if self.config.privileged: + options.append('--privileged') + if self.config.memory: options.extend([ f'--memory={self.config.memory}', @@ -499,7 +1114,7 @@ class NetworkRemoteProfile(RemoteProfile[NetworkRemoteConfig]): else: return - raise ApplicationError(f'Timeout waiting for {self.config.name} instance {core_ci.instance_id}.') + raise HostConnectionError(f'Timeout waiting for {self.config.name} instance {core_ci.instance_id}.') def get_controller_target_connections(self): # type: () -> t.List[SshConnection] """Return SSH connection(s) for accessing the host as a target from the controller.""" @@ -573,16 +1188,11 @@ class PosixRemoteProfile(ControllerHostProfile[PosixRemoteConfig], RemoteProfile if settings.user == 'root': become = None # type: t.Optional[Become] - elif self.config.platform == 'freebsd': - become = Su() - elif self.config.platform == 'macos': - become = Sudo() - elif self.config.platform == 'rhel': - become = Sudo() - elif self.config.platform == 'ubuntu': - become = Sudo() + elif self.config.become: + become = SUPPORTED_BECOME_METHODS[self.config.become]() else: - raise NotImplementedError(f'Become support has not been implemented for platform "{self.config.platform}" and user "{settings.user}" is not root.') + display.warning(f'Defaulting to "sudo" for platform "{self.config.platform}" become support.', unique=True) + become = Sudo() return SshConnection(self.args, settings, become) @@ -594,12 +1204,12 @@ class PosixRemoteProfile(ControllerHostProfile[PosixRemoteConfig], RemoteProfile try: return self.get_working_directory() except SubprocessError as ex: - if 'Permission denied' in ex.message: - raise - + # No "Permission denied" check is performed here. + # Unlike containers, with remote instances, user configuration isn't guaranteed to have been completed before SSH connections are attempted. + display.warning(str(ex)) time.sleep(10) - raise ApplicationError(f'Timeout waiting for {self.config.name} instance {core_ci.instance_id}.') + raise HostConnectionError(f'Timeout waiting for {self.config.name} instance {core_ci.instance_id}.') def get_controller_target_connections(self): # type: () -> t.List[SshConnection] """Return SSH connection(s) for accessing the host as a target from the controller.""" @@ -735,7 +1345,7 @@ class WindowsRemoteProfile(RemoteProfile[WindowsRemoteConfig]): else: return - raise ApplicationError(f'Timeout waiting for {self.config.name} instance {core_ci.instance_id}.') + raise HostConnectionError(f'Timeout waiting for {self.config.name} instance {core_ci.instance_id}.') def get_controller_target_connections(self): # type: () -> t.List[SshConnection] """Return SSH connection(s) for accessing the host as a target from the controller.""" diff --git a/test/lib/ansible_test/_internal/provisioning.py b/test/lib/ansible_test/_internal/provisioning.py index 85d91326f1..e61a023094 100644 --- a/test/lib/ansible_test/_internal/provisioning.py +++ b/test/lib/ansible_test/_internal/provisioning.py @@ -18,6 +18,7 @@ from .config import ( from .util import ( ApplicationError, + HostConnectionError, display, open_binary_file, verify_sys_executable, @@ -184,13 +185,26 @@ def dispatch_jobs(jobs): # type: (t.List[t.Tuple[HostProfile, WrappedThread]]) time.sleep(1) failed = False + connection_failures = 0 for profile, thread in jobs: try: thread.wait_for_result() + except HostConnectionError as ex: + display.error(f'Host {profile.config} connection failed:\n{ex}') + failed = True + connection_failures += 1 + except ApplicationError as ex: + display.error(f'Host {profile.config} job failed:\n{ex}') + failed = True except Exception as ex: # pylint: disable=broad-except - display.error(f'Host {profile} job failed: {ex}\n{"".join(traceback.format_tb(ex.__traceback__))}') + name = f'{"" if ex.__class__.__module__ == "builtins" else ex.__class__.__module__ + "."}{ex.__class__.__qualname__}' + display.error(f'Host {profile.config} job failed:\nTraceback (most recent call last):\n' + f'{"".join(traceback.format_tb(ex.__traceback__)).rstrip()}\n{name}: {ex}') failed = True + if connection_failures: + raise HostConnectionError(f'Host job(s) failed, including {connection_failures} connection failure(s). See previous error(s) for details.') + if failed: raise ApplicationError('Host job(s) failed. See previous error(s) for details.') diff --git a/test/lib/ansible_test/_internal/target.py b/test/lib/ansible_test/_internal/target.py index 3962b95f3b..6b29605d92 100644 --- a/test/lib/ansible_test/_internal/target.py +++ b/test/lib/ansible_test/_internal/target.py @@ -702,6 +702,8 @@ class IntegrationTarget(CompletionTarget): # configuration + self.retry_never = 'retry/never/' in self.aliases + self.setup_once = tuple(sorted(set(g.split('/')[2] for g in groups if g.startswith('setup/once/')))) self.setup_always = tuple(sorted(set(g.split('/')[2] for g in groups if g.startswith('setup/always/')))) self.needs_target = tuple(sorted(set(g.split('/')[2] for g in groups if g.startswith('needs/target/')))) diff --git a/test/lib/ansible_test/_internal/thread.py b/test/lib/ansible_test/_internal/thread.py index f74b365d96..601f60e44d 100644 --- a/test/lib/ansible_test/_internal/thread.py +++ b/test/lib/ansible_test/_internal/thread.py @@ -1,6 +1,8 @@ """Python threading tools.""" from __future__ import annotations +import collections.abc as c +import contextlib import functools import sys import threading @@ -59,3 +61,25 @@ def mutex(func): # type: (TCallable) -> TCallable return func(*args, **kwargs) return wrapper # type: ignore[return-value] # requires https://www.python.org/dev/peps/pep-0612/ support + + +__named_lock = threading.Lock() +__named_locks: dict[str, threading.Lock] = {} + + +@contextlib.contextmanager +def named_lock(name: str) -> c.Iterator[bool]: + """ + Context manager that provides named locks using threading.Lock instances. + Once named lock instances are created they are not deleted. + Returns True if this is the first instance of the named lock, otherwise False. + """ + with __named_lock: + if lock_instance := __named_locks.get(name): + first = False + else: + first = True + lock_instance = __named_locks[name] = threading.Lock() + + with lock_instance: + yield first diff --git a/test/lib/ansible_test/_internal/util.py b/test/lib/ansible_test/_internal/util.py index 60901ef507..fb125200b0 100644 --- a/test/lib/ansible_test/_internal/util.py +++ b/test/lib/ansible_test/_internal/util.py @@ -668,6 +668,16 @@ def pass_vars(required, optional): # type: (t.Collection[str], t.Collection[str return env +def verified_chmod(path: str, mode: int) -> None: + """Perform chmod on the specified path and then verify the permissions were applied.""" + os.chmod(path, mode) # pylint: disable=ansible-bad-function + + executable = any(mode & perm for perm in (stat.S_IXUSR, stat.S_IXGRP, stat.S_IXOTH)) + + if executable and not os.access(path, os.X_OK): + raise ApplicationError(f'Path "{path}" should executable, but is not. Is the filesystem mounted with the "noexec" option?') + + def remove_tree(path): # type: (str) -> None """Remove the specified directory, siliently continuing if the directory does not exist.""" try: @@ -921,6 +931,23 @@ class MissingEnvironmentVariable(ApplicationError): self.name = name +class HostConnectionError(ApplicationError): + """ + Raised when the initial connection during host profile setup has failed and all retries have been exhausted. + Raised by provisioning code when one or more provisioning threads raise this exception. + Also raised when an SSH connection fails for the shell command. + """ + def __init__(self, message: str, callback: t.Callable[[], None] = None) -> None: + super().__init__(message) + + self._callback = callback + + def run_callback(self) -> None: + """Run the error callback, if any.""" + if self._callback: + self._callback() + + def retry(func, ex_type=SubprocessError, sleep=10, attempts=10, warn=True): """Retry the specified function on failure.""" for dummy in range(1, attempts): diff --git a/test/lib/ansible_test/_internal/util_common.py b/test/lib/ansible_test/_internal/util_common.py index f0a9d7a3f9..ecf8ae6676 100644 --- a/test/lib/ansible_test/_internal/util_common.py +++ b/test/lib/ansible_test/_internal/util_common.py @@ -37,6 +37,7 @@ from .util import ( ApplicationError, SubprocessError, generate_name, + verified_chmod, ) from .io import ( @@ -282,9 +283,9 @@ def get_injector_path(): # type: () -> str script = set_shebang(script, shebang) write_text_file(dst, script) - os.chmod(dst, mode) + verified_chmod(dst, mode) - os.chmod(injector_path, MODE_DIRECTORY) + verified_chmod(injector_path, MODE_DIRECTORY) def cleanup_injector(): """Remove the temporary injector directory.""" @@ -345,7 +346,7 @@ def get_python_path(interpreter): # type: (str) -> str create_interpreter_wrapper(interpreter, injected_interpreter) - os.chmod(python_path, MODE_DIRECTORY) + verified_chmod(python_path, MODE_DIRECTORY) if not PYTHON_PATHS: atexit.register(cleanup_python_paths) @@ -383,7 +384,7 @@ def create_interpreter_wrapper(interpreter, injected_interpreter): # type: (str write_text_file(injected_interpreter, code) - os.chmod(injected_interpreter, MODE_FILE_EXECUTE) + verified_chmod(injected_interpreter, MODE_FILE_EXECUTE) def cleanup_python_paths(): diff --git a/test/lib/ansible_test/_util/controller/sanity/pylint/config/ansible-test.cfg b/test/lib/ansible_test/_util/controller/sanity/pylint/config/ansible-test.cfg index 76200eb87f..1ebda6db03 100644 --- a/test/lib/ansible_test/_util/controller/sanity/pylint/config/ansible-test.cfg +++ b/test/lib/ansible_test/_util/controller/sanity/pylint/config/ansible-test.cfg @@ -7,6 +7,7 @@ disable= no-self-use, raise-missing-from, # Python 2.x does not support raise from too-few-public-methods, + too-many-public-methods, too-many-arguments, too-many-branches, too-many-instance-attributes, diff --git a/test/lib/ansible_test/_util/controller/sanity/pylint/plugins/unwanted.py b/test/lib/ansible_test/_util/controller/sanity/pylint/plugins/unwanted.py index 6ef5dc2ac9..1be42f51f2 100644 --- a/test/lib/ansible_test/_util/controller/sanity/pylint/plugins/unwanted.py +++ b/test/lib/ansible_test/_util/controller/sanity/pylint/plugins/unwanted.py @@ -21,11 +21,13 @@ class UnwantedEntry: modules_only=False, # type: bool names=None, # type: t.Optional[t.Tuple[str, ...]] ignore_paths=None, # type: t.Optional[t.Tuple[str, ...]] + ansible_test_only=False, # type: bool ): # type: (...) -> None self.alternative = alternative self.modules_only = modules_only self.names = set(names) if names else set() self.ignore_paths = ignore_paths + self.ansible_test_only = ansible_test_only def applies_to(self, path, name=None): # type: (str, t.Optional[str]) -> bool """Return True if this entry applies to the given path, otherwise return False.""" @@ -39,6 +41,9 @@ class UnwantedEntry: if self.ignore_paths and any(path.endswith(ignore_path) for ignore_path in self.ignore_paths): return False + if self.ansible_test_only and '/test/lib/ansible_test/_internal/' not in path: + return False + if self.modules_only: return is_module_path(path) @@ -114,6 +119,10 @@ class AnsibleUnwantedChecker(BaseChecker): # see https://docs.python.org/3/library/tempfile.html#tempfile.mktemp 'tempfile.mktemp': UnwantedEntry('tempfile.mkstemp'), + # os.chmod resolves as posix.chmod + 'posix.chmod': UnwantedEntry('verified_chmod', + ansible_test_only=True), + 'sys.exit': UnwantedEntry('exit_json or fail_json', ignore_paths=( '/lib/ansible/module_utils/basic.py', diff --git a/test/lib/ansible_test/_util/target/setup/bootstrap.sh b/test/lib/ansible_test/_util/target/setup/bootstrap.sh index 80ad16c02a..3494c7e982 100644 --- a/test/lib/ansible_test/_util/target/setup/bootstrap.sh +++ b/test/lib/ansible_test/_util/target/setup/bootstrap.sh @@ -80,6 +80,69 @@ pip_install() { done } +bootstrap_remote_alpine() +{ + py_pkg_prefix="py3" + + packages=" + acl + bash + gcc + python3-dev + ${py_pkg_prefix}-pip + sudo + " + + if [ "${controller}" ]; then + packages=" + ${packages} + ${py_pkg_prefix}-cryptography + ${py_pkg_prefix}-packaging + ${py_pkg_prefix}-yaml + ${py_pkg_prefix}-jinja2 + ${py_pkg_prefix}-resolvelib + " + fi + + while true; do + # shellcheck disable=SC2086 + apk add -q ${packages} \ + && break + echo "Failed to install packages. Sleeping before trying again..." + sleep 10 + done +} + +bootstrap_remote_fedora() +{ + py_pkg_prefix="python3" + + packages=" + acl + gcc + ${py_pkg_prefix}-devel + " + + if [ "${controller}" ]; then + packages=" + ${packages} + ${py_pkg_prefix}-cryptography + ${py_pkg_prefix}-jinja2 + ${py_pkg_prefix}-packaging + ${py_pkg_prefix}-pyyaml + ${py_pkg_prefix}-resolvelib + " + fi + + while true; do + # shellcheck disable=SC2086 + dnf install -q -y ${packages} \ + && break + echo "Failed to install packages. Sleeping before trying again..." + sleep 10 + done +} + bootstrap_remote_freebsd() { if [ "${python_version}" = "2.7" ]; then @@ -150,6 +213,19 @@ bootstrap_remote_freebsd() extra-index-url = https://spare-tire.testing.ansible.com/simple/ prefer-binary = yes " > /etc/pip.conf + + # enable ACL support on the root filesystem (required for become between unprivileged users) + fs_path="/" + fs_device="$(mount -v "${fs_path}" | cut -w -f 1)" + # shellcheck disable=SC2001 + fs_device_escaped=$(echo "${fs_device}" | sed 's|/|\\/|g') + + mount -o acls "${fs_device}" "${fs_path}" + awk 'BEGIN{FS=" "}; /'"${fs_device_escaped}"'/ {gsub(/^rw$/,"rw,acls", $4); print; next} // {print}' /etc/fstab > /etc/fstab.new + mv /etc/fstab.new /etc/fstab + + # enable sudo without a password for the wheel group, allowing ansible to use the sudo become plugin + echo '%wheel ALL=(ALL:ALL) NOPASSWD: ALL' > /usr/local/etc/sudoers.d/ansible-test } bootstrap_remote_macos() @@ -286,21 +362,37 @@ bootstrap_remote_ubuntu() py_pkg_prefix="python3" packages=" + acl gcc - ${py_pkg_prefix}-dev - ${py_pkg_prefix}-pip - ${py_pkg_prefix}-venv + python${python_version}-dev + python3-pip + python${python_version}-venv " if [ "${controller}" ]; then - # The resolvelib package is not listed here because the available version (0.8.1) is incompatible with ansible. - # Instead, ansible-test will install it using pip. + cryptography_pkg="${py_pkg_prefix}-cryptography" + jinja2_pkg="${py_pkg_prefix}-jinja2" + packaging_pkg="${py_pkg_prefix}-packaging" + pyyaml_pkg="${py_pkg_prefix}-yaml" + resolvelib_pkg="${py_pkg_prefix}-resolvelib" + + # Declare platforms which do not have supporting OS packages available. + # For these ansible-test will use pip to install the requirements instead. + # Only the platform is checked since Ubuntu shares Python packages across Python versions. + case "${platform_version}" in + "20.04") + jinja2_pkg="" # too old + resolvelib_pkg="" # not available + ;; + esac + packages=" ${packages} - ${py_pkg_prefix}-cryptography - ${py_pkg_prefix}-jinja2 - ${py_pkg_prefix}-packaging - ${py_pkg_prefix}-yaml + ${cryptography_pkg} + ${jinja2_pkg} + ${packaging_pkg} + ${pyyaml_pkg} + ${resolvelib_pkg} " fi @@ -312,6 +404,14 @@ bootstrap_remote_ubuntu() echo "Failed to install packages. Sleeping before trying again..." sleep 10 done + + if [ "${controller}" ]; then + if [ "${platform_version}/${python_version}" = "20.04/3.9" ]; then + # Install pyyaml using pip so libyaml support is available on Python 3.9. + # The OS package install (which is installed by default) only has a .so file for Python 3.8. + pip_install "--upgrade pyyaml" + fi + fi } bootstrap_docker() @@ -329,6 +429,8 @@ bootstrap_remote() python_package_version="$(echo "${python_version}" | tr -d '.')" case "${platform}" in + "alpine") bootstrap_remote_alpine ;; + "fedora") bootstrap_remote_fedora ;; "freebsd") bootstrap_remote_freebsd ;; "macos") bootstrap_remote_macos ;; "rhel") bootstrap_remote_rhel ;; @@ -345,6 +447,9 @@ bootstrap() install_ssh_keys customize_bashrc + # allow tests to detect ansible-test bootstrapped instances, as well as the bootstrap type + echo "${bootstrap_type}" > /etc/ansible-test.bootstrap + case "${bootstrap_type}" in "docker") bootstrap_docker ;; "remote") bootstrap_remote ;; diff --git a/test/lib/ansible_test/_util/target/setup/check_systemd_cgroup_v1.sh b/test/lib/ansible_test/_util/target/setup/check_systemd_cgroup_v1.sh new file mode 100644 index 0000000000..3b05a3f444 --- /dev/null +++ b/test/lib/ansible_test/_util/target/setup/check_systemd_cgroup_v1.sh @@ -0,0 +1,17 @@ +# shellcheck shell=sh + +set -eu + +>&2 echo "@MARKER@" + +cgroup_path="$(awk -F: '$2 ~ /^name=systemd$/ { print "/sys/fs/cgroup/systemd"$3 }' /proc/1/cgroup)" + +if [ "${cgroup_path}" ] && [ -d "${cgroup_path}" ]; then + probe_path="${cgroup_path%/}/ansible-test-probe-@LABEL@" + mkdir "${probe_path}" + rmdir "${probe_path}" + exit 0 +fi + +>&2 echo "No systemd cgroup v1 hierarchy found" +exit 1 diff --git a/test/lib/ansible_test/_util/target/setup/probe_cgroups.py b/test/lib/ansible_test/_util/target/setup/probe_cgroups.py new file mode 100644 index 0000000000..2ac7ecb084 --- /dev/null +++ b/test/lib/ansible_test/_util/target/setup/probe_cgroups.py @@ -0,0 +1,31 @@ +"""A tool for probing cgroups to determine write access.""" +from __future__ import (absolute_import, division, print_function) +__metaclass__ = type + +import json +import os +import sys + + +def main(): # type: () -> None + """Main program entry point.""" + probe_dir = sys.argv[1] + paths = sys.argv[2:] + results = {} + + for path in paths: + probe_path = os.path.join(path, probe_dir) + + try: + os.mkdir(probe_path) + os.rmdir(probe_path) + except Exception as ex: # pylint: disable=broad-except + results[path] = str(ex) + else: + results[path] = None + + print(json.dumps(results, sort_keys=True)) + + +if __name__ == '__main__': + main() diff --git a/test/utils/shippable/aix.sh b/test/utils/shippable/aix.sh deleted file mode 120000 index 6ddb776854..0000000000 --- a/test/utils/shippable/aix.sh +++ /dev/null @@ -1 +0,0 @@ -remote.sh \ No newline at end of file diff --git a/test/utils/shippable/alpine.sh b/test/utils/shippable/alpine.sh new file mode 120000 index 0000000000..6ddb776854 --- /dev/null +++ b/test/utils/shippable/alpine.sh @@ -0,0 +1 @@ +remote.sh \ No newline at end of file diff --git a/test/utils/shippable/fedora.sh b/test/utils/shippable/fedora.sh new file mode 120000 index 0000000000..6ddb776854 --- /dev/null +++ b/test/utils/shippable/fedora.sh @@ -0,0 +1 @@ +remote.sh \ No newline at end of file diff --git a/test/utils/shippable/ubuntu.sh b/test/utils/shippable/ubuntu.sh new file mode 120000 index 0000000000..6ddb776854 --- /dev/null +++ b/test/utils/shippable/ubuntu.sh @@ -0,0 +1 @@ +remote.sh \ No newline at end of file -- cgit v1.2.1