From e01030633c73d3974390292bba381aca1224709b Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Wed, 30 Jun 2021 15:51:03 +0100 Subject: core: when recursively bind-remounting nested mounts, use options from top one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mount points are stacked, bind_remount_recursive_with_mountinfo() uses the existing mount options of the "lower" level mount (i.e. the first one that was mounted on a mount point). But the actual mount point in use is the "top" one (i.e. the last one that was mounted on a mount point), so in practice if the mount options are different between the layers, the bottom options are used by mistake on the top mount, which is not what we want. This is because libmount returns the "bottom" one first. If hashmap_ensure_put() returns -EEXIST, which means the same key (path) with a different value (options) is already present, update the hashmap instead of discarding the result. This way, the last/top mount options are always used when mounts are stacked on a mount point. This was found to cause problems as LXC version 4.x stacks two /sys mounts, the bottom one read-write and the top one read-only. systemd accidentally remounts the top one read-write, breaking various expectations since a read-only /sys is the way we decide whether we are running in a container or not (in this particular case, networkd tests are broken as networkd expects to be able to modify network settings with a writable /sys). Future versions of LXC will no longer do this double-stacking, but we need to support running inside older versions too. This was triggered by https://github.com/systemd/systemd/commit/6720e356c137 as that causes a recursive remount of '/', which processes '/sys' as one of the submounts, from make_nosuid(). But it's likely that other combinations of options could trigger this as well.
Before: root@systemd-debug:/# systemd-run -t --wait --property ProtectSystem=yes findmnt Running as unit: run-u9.service Press ^] three times within 1s to disconnect TTY. TARGET SOURCE FSTYPE OPTIONS / /dev/sda2[/var/lib/lxc/systemd-debug/rootfs] │ ext4 ro,nosuid,relatime,errors=remount-ro,stripe= ├─/dev none tmpfs rw,nosuid,relatime,size=492k,mode=755 │ ├─/dev/.lxc/proc proc proc rw,nosuid,relatime │ ├─/dev/.lxc/sys sys sysfs rw,nosuid,relatime │ ├─/dev/console devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/pts devpts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/ptmx devpts[/ptmx] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty1 devpts[/0] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty2 devpts[/1] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty3 devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/tty4 devpts[/3] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptm │ ├─/dev/shm tmpfs tmpfs rw,nosuid,nodev │ ├─/dev/hugepages hugetlbfs hugetlbfs rw,nosuid,relatime,pagesize=2M │ └─/dev/mqueue mqueue mqueue rw,nosuid,nodev,noexec,relatime ├─/proc proc proc rw,nosuid,nodev,noexec,relatime │ ├─/proc/sys proc[/sys] proc ro,nosuid,nodev,noexec,relatime │ │ ├─/proc/sys/net proc[/sys/net] proc rw,nosuid,nodev,noexec,relatime │ │ └─/proc/sys/kernel/random/boot_id │ │ none[/.lxc-boot-id] tmpfs ro,nosuid,nodev,noexec,relatime,size=492k,mo │ └─/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,nosuid,nodev,noexec,relatime ├─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime │ └─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime │ ├─/sys/devices/virtual/net sysfs sysfs rw,relatime │ │ └─/sys/devices/virtual/net │ │ sysfs[/devices/virtual/net] sysfs rw,nosuid,relatime │ ├─/sys/fs/fuse/connections fusectl fusectl rw,nosuid,nodev,noexec,relatime │ └─/sys/fs/cgroup cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,m ├─/run tmpfs tmpfs 
ro,nosuid,nodev,size=4912348k,nr_inodes=8192 │ ├─/run/credentials tmpfs[/systemd/inaccessible/dir] tmpfs ro,nosuid,nodev,noexec,size=4912348k,nr_inod │ └─/run/systemd/incoming tmpfs[/systemd/propagate/run-u9.service] │ tmpfs ro,nosuid,nodev,size=4912348k,nr_inodes=8192 ├─/tmp tmpfs tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409 │ └─/tmp tmpfs[/systemd-private-b730df90da424397a3f246cb15dcdbb1-run-u9.service-K6EUwf/tmp] │ tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409 └─/var/tmp /dev/sda2[/var/lib/lxc/systemd-debug/rootfs/var/tmp/systemd-private-b730df90da424397a3f246cb15dcdbb1-run-u9.service-vEHyRi/tmp] ext4 rw,nosuid,relatime,errors=remount-ro,stripe= Finished with result: success Main processes terminated with: code=exited/status=0 Service runtime: 14.249s CPU time consumed: 37ms After: root@systemd-debug:/# systemd-run -t --wait --property ProtectSystem=yes findmnt Running as unit: run-u3.service Press ^] three times within 1s to disconnect TTY. TARGET SOURCE FSTYPE OPTIONS / /dev/sda2[/var/lib/lxc/systemd-debug/rootfs] │ ext4 rw,relatime,errors=remount-ro,stripe=32699 ├─/dev none tmpfs rw,relatime,size=492k,mode=755 │ ├─/dev/.lxc/proc proc proc rw,relatime │ ├─/dev/.lxc/sys sys sysfs rw,relatime │ ├─/dev/console devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/pts devpts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/ptmx devpts[/ptmx] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty1 devpts[/0] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty2 devpts[/1] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty3 devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/tty4 devpts[/3] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode │ ├─/dev/shm tmpfs tmpfs rw,nosuid,nodev │ ├─/dev/hugepages hugetlbfs hugetlbfs rw,relatime,pagesize=2M │ └─/dev/mqueue mqueue mqueue rw,nosuid,nodev,noexec,relatime ├─/proc proc proc 
rw,nosuid,nodev,noexec,relatime │ ├─/proc/sys proc[/sys] proc ro,nosuid,nodev,noexec,relatime │ │ ├─/proc/sys/net proc[/sys/net] proc rw,nosuid,nodev,noexec,relatime │ │ └─/proc/sys/kernel/random/boot_id │ │ none[/.lxc-boot-id] tmpfs ro,nosuid,nodev,noexec,relatime,size=492k,mode=75 │ └─/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,nosuid,nodev,noexec,relatime ├─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime │ └─/sys sysfs sysfs ro,nosuid,nodev,noexec,relatime │ ├─/sys/devices/virtual/net sysfs sysfs rw,relatime │ │ └─/sys/devices/virtual/net │ │ sysfs[/devices/virtual/net] sysfs rw,nosuid,nodev,noexec,relatime │ ├─/sys/fs/fuse/connections fusectl fusectl rw,nosuid,nodev,noexec,relatime │ └─/sys/fs/cgroup cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory ├─/run tmpfs tmpfs rw,nosuid,nodev,size=4912348k,nr_inodes=819200,mo │ ├─/run/credentials tmpfs[/systemd/inaccessible/dir] │ │ tmpfs ro,nosuid,nodev,noexec,size=4912348k,nr_inodes=81 │ └─/run/systemd/incoming tmpfs[/systemd/propagate/run-u3.service] │ tmpfs ro,nosuid,nodev,size=4912348k,nr_inodes=819200,mo ├─/tmp tmpfs tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409600 ├─/boot /dev/sda2[/var/lib/lxc/systemd-debug/rootfs/boot] │ ext4 ro,relatime,errors=remount-ro,stripe=32699 └─/usr /dev/sda2[/var/lib/lxc/systemd-debug/rootfs/usr] ext4 ro,relatime,errors=remount-ro,stripe=32699 Finished with result: success Main processes terminated with: code=exited/status=0 Service runtime: 14ms CPU time consumed: 5ms Host (LXC): root@systemd-debug:/# findmnt TARGET SOURCE FSTYPE OPTIONS / /dev/sda2[/var/lib/lxc/systemd-debug/rootfs] │ ext4 rw,relatime,errors=remount-ro,stripe=32699 ├─/run tmpfs tmpfs rw,nosuid,nodev,size=4912348k,nr_inodes=819200,mode=755 ├─/tmp tmpfs tmpfs rw,nosuid,nodev,size=12280872k,nr_inodes=409600 ├─/dev none tmpfs rw,relatime,size=492k,mode=755 │ ├─/dev/pts devpts devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/ptmx devpts[/ptmx] devpts 
rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty1 devpts[/0] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty2 devpts[/1] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty3 devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/tty4 devpts[/3] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=666,ma │ ├─/dev/shm tmpfs tmpfs rw,nosuid,nodev │ ├─/dev/hugepages hugetlbfs hugetlbfs rw,relatime,pagesize=2M │ ├─/dev/mqueue mqueue mqueue rw,nosuid,nodev,noexec,relatime │ ├─/dev/console devpts[/2] devpts rw,nosuid,noexec,relatime,gid=5,mode=620,ptmxmode=000 │ ├─/dev/.lxc/proc proc proc rw,relatime │ └─/dev/.lxc/sys sys sysfs rw,relatime ├─/proc proc proc rw,nosuid,nodev,noexec,relatime │ ├─/proc/sys proc[/sys] proc ro,nosuid,nodev,noexec,relatime │ │ ├─/proc/sys/kernel/random/boot_id │ │ │ none[/.lxc-boot-id] tmpfs ro,nosuid,nodev,noexec,relatime,size=492k,mode=755 │ │ └─/proc/sys/net proc[/sys/net] proc rw,nosuid,nodev,noexec,relatime │ └─/proc/sysrq-trigger proc[/sysrq-trigger] proc ro,nosuid,nodev,noexec,relatime └─/sys sysfs sysfs rw,nosuid,nodev,noexec,relatime └─/sys sysfs sysfs ro,nosuid,nodev,noexec,relatime ├─/sys/devices/virtual/net sysfs sysfs rw,relatime │ └─/sys/devices/virtual/net │ sysfs[/devices/virtual/net] │ sysfs rw,nosuid,nodev,noexec,relatime ├─/sys/fs/fuse/connections fusectl fusectl rw,nosuid,nodev,noexec,relatime └─/sys/fs/cgroup cgroup cgroup2 rw,nosuid,nodev,noexec,relatime,nsdelegate,memory_recurs Fixes https://github.com/systemd/systemd/issues/20032 --- src/shared/mount-util.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'src/shared/mount-util.c') diff --git a/src/shared/mount-util.c b/src/shared/mount-util.c index ff95fbc569..594efea989 100644 --- a/src/shared/mount-util.c +++ b/src/shared/mount-util.c @@ -251,7 +251,13 @@ int bind_remount_recursive_with_mountinfo( r = hashmap_ensure_put(&todo, 
&path_hash_ops_free, d, ULONG_TO_PTR(flags)); if (r == -EEXIST) - continue; + /* If the same path was recorded, but with different mount flags, update it: + * it means a mount point is overmounted, and libmount returns the "bottom" (or + * older one) first, but we want to reapply the flags from the "top" (or newer + * one). See: https://github.com/systemd/systemd/issues/20032 + * Note that this shouldn't really fail, as we were just told that the key + * exists, and it's an update so we want 'd' to be freed immediately. */ + r = hashmap_update(todo, d, ULONG_TO_PTR(flags)); if (r < 0) return r; if (r > 0) -- cgit v1.2.1