summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorLennart Poettering <lennart@poettering.net>2023-01-06 15:21:27 +0100
committerGitHub <noreply@github.com>2023-01-06 15:21:27 +0100
commit7122aee5ab12376926362cbe6eeeaa3b144b9eed (patch)
treefe9dbbbda4966015ec19958815a59263cecc35ae
parent48d85160438827c1424104315547f960ec0d6e43 (diff)
parent03ede612beb6f401cf433e90988b21aa57788bb3 (diff)
downloadsystemd-7122aee5ab12376926362cbe6eeeaa3b144b9eed.tar.gz
Merge pull request #25918 from bluca/smbios_sd_notify
Support AF_VSOCK in sd_notify and pick up notify_socket from creds
-rw-r--r--TODO12
-rw-r--r--docs/CREDENTIALS.md25
-rw-r--r--man/sd_notify.xml27
-rw-r--r--man/systemd.system-credentials.xml18
-rw-r--r--src/basic/socket-util.c68
-rw-r--r--src/basic/socket-util.h6
-rw-r--r--src/core/import-creds.c13
-rw-r--r--src/libsystemd/sd-daemon/sd-daemon.c67
-rw-r--r--src/shared/socket-netlink.c158
9 files changed, 266 insertions, 128 deletions
diff --git a/TODO b/TODO
index f30750550f..3eed1f53ad 100644
--- a/TODO
+++ b/TODO
@@ -552,10 +552,6 @@ Features:
* sd-boot should look for information what to boot in SMBIOS, too, so that VM
managers can tell sd-boot what to boot into and suchlike
-* PID 1 should look for an SMBIOS variable that encodes an AF_VSOCK address it
- should send sd_notify() ready notifications to. That way a VMM can boot up a
- system, and generically know when it finished booting.
-
* add "systemd-sysext identify" verb, that you can point on any file in /usr/
and that determines from which overlayfs layer it originates, which image, and with
what it was signed.
@@ -777,13 +773,7 @@ Features:
don't query this unnecessarily in entirely uninitialized
containers. (i.e. containers with empty /etc).
-* beef up sd_notify() to support AV_VSOCK in $NOTIFY_SOCKET, so that VM
- managers can get ready notifications from VMs, just like container managers
- from their payload. Also pick up address from qemu/fw_cfg if set there.
- (which has benefits, given SecureBoot and kernel cmdline are not necessarily
- friends.)
-
-* mirroring this: maybe support binding to AV_VSOCK in Type=notify services,
+* sd_notify/vsock: maybe support binding to AF_VSOCK in Type=notify services,
then passing $NOTIFY_SOCKET and $NOTIFY_GUESTCID with PID1's cid (typically
fixed to "2", i.e. the official host cid) and the expected guest cid, for the
two sides of the channel. The latter env var could then be used in an
diff --git a/docs/CREDENTIALS.md b/docs/CREDENTIALS.md
index 9e16dd3ba4..debe0a714f 100644
--- a/docs/CREDENTIALS.md
+++ b/docs/CREDENTIALS.md
@@ -330,6 +330,18 @@ systemd-run -p LoadCredential=mycred -P --wait systemd-creds cat mycred
Various services shipped with `systemd` consume credentials for tweaking behaviour:
+* [`systemd(1)`](https://www.freedesktop.org/software/systemd/man/systemd.html)
+ (i.e. PID1, the system manager) will look for the credential `vmm.notify_socket`
+ and will use it to send a `READY=1` datagram when the system has finished
+ booting. This is useful for hypervisors/VMMs or other processes on the host
+ to receive a notification via VSOCK when a virtual machine has finished booting.
+ Note that in case the hypervisor does not support `SOCK_DGRAM` over `AF_VSOCK`,
+ `SOCK_SEQPACKET` will be tried instead. The credential payload should be in the
+ form: `vsock:<CID>:<PORT>`, where `<CID>` is mandatory: `sd_notify()` rejects an
+ address whose CID is omitted (`VMADDR_CID_ANY`). Also note that this requires
+ VHOST support in both the guest and the host kernels, with the corresponding
+ kernel modules loaded.
+
* [`systemd-sysusers(8)`](https://www.freedesktop.org/software/systemd/man/systemd-sysusers.html)
will look for the credentials `passwd.hashed-password.<username>`,
`passwd.plaintext-password.<username>` and `passwd.shell.<username>` to
@@ -382,7 +394,8 @@ qemu-system-x86_64 \
```
This boots the specified disk image via qemu, provisioning public key SSH access
-for the root user from the caller's key:
+for the root user from the caller's key, and sends a notification when booting
+has finished to a process on the host:
```
qemu-system-x86_64 \
@@ -396,8 +409,18 @@ qemu-system-x86_64 \
-drive if=none,id=hd,file=test.raw,format=raw \
-device virtio-scsi-pci,id=scsi \
-device scsi-hd,drive=hd,bootindex=1 \
+ -device vhost-vsock-pci,id=vhost-vsock-pci0,guest-cid=42 \
+ -smbios type=11,value=io.systemd.credential:vmm.notify_socket=vsock:2:1234 \
-smbios type=11,value=io.systemd.credential.binary:tmpfiles.extra=$(echo "f~ /root/.ssh/authorized_keys 700 root root - $(ssh-add -L | base64 -w 0)" | base64 -w 0)
```
+
+A process on the host can listen for the notification, for example:
+
+```
+$ socat - VSOCK-LISTEN:1234,socktype=5
+READY=1
+```
+
## Relevant Paths
From *service* perspective the runtime path to find loaded credentials in is
diff --git a/man/sd_notify.xml b/man/sd_notify.xml
index de402950bb..021cd0384f 100644
--- a/man/sd_notify.xml
+++ b/man/sd_notify.xml
@@ -368,13 +368,26 @@
<xi:include href="libsystemd-pkgconfig.xml" xpointer="pkgconfig-text"/>
<para>These functions send a single datagram with the
- state string as payload to the <constant>AF_UNIX</constant> socket
- referenced in the <varname>$NOTIFY_SOCKET</varname> environment
- variable. If the first character of
- <varname>$NOTIFY_SOCKET</varname> is <literal>@</literal>, the
- string is understood as Linux abstract namespace socket. The
- datagram is accompanied by the process credentials of the sending
- service, using SCM_CREDENTIALS.</para>
+ state string as payload to the socket referenced in the
+ <varname>$NOTIFY_SOCKET</varname> environment variable. If the
+ first character of <varname>$NOTIFY_SOCKET</varname> is
+ <literal>/</literal> or <literal>@</literal>, the string is understood
+ as an <constant>AF_UNIX</constant> or Linux abstract namespace socket
+ (respectively), and in both cases the datagram is accompanied by the
+ process credentials of the sending service, using SCM_CREDENTIALS. If
+ the string starts with <literal>vsock:</literal> then the string is
+ understood as an <constant>AF_VSOCK</constant> address, which is useful
+ for hypervisors/VMMs or other processes on the host to receive a
+ notification when a virtual machine has finished booting. Note that in
+ case the hypervisor does not support <constant>SOCK_DGRAM</constant>
+ over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant>
+ will be used instead. The address should be in the form:
+ <literal>vsock:CID:PORT</literal>. Note that unlike other uses of vsock,
+ the CID is mandatory and cannot be <literal>VMADDR_CID_ANY</literal>.
+ Note that PID1 will send the VSOCK packets from a privileged port
+ (i.e.: lower than 1024), as an attempt to address concerns that unprivileged
+ processes in the guest might try to send malicious notifications to the
+ host, driving it to make destructive decisions based on them.</para>
</refsect1>
<refsect1>
diff --git a/man/systemd.system-credentials.xml b/man/systemd.system-credentials.xml
index 9e49e3feae..95a437adc4 100644
--- a/man/systemd.system-credentials.xml
+++ b/man/systemd.system-credentials.xml
@@ -199,6 +199,24 @@
</listitem>
</varlistentry>
+ <varlistentry>
+ <term><varname>vmm.notify_socket</varname></term>
+ <listitem>
+ <para>This credential is parsed looking for an <constant>AF_VSOCK</constant> or
+ <constant>AF_UNIX</constant> address where to send a <constant>READY=1</constant>
+ notification datagram when the system has finished booting. See
+ <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.
+ This is useful for hypervisors/VMMs or other processes on the host
+ to receive a notification via VSOCK when a virtual machine has finished booting.
+ Note that in case the hypervisor does not support <constant>SOCK_DGRAM</constant>
+ over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant> will be
+ tried instead. The credential payload for <constant>AF_VSOCK</constant> should be
+ in the form: <literal>vsock:CID:PORT</literal>, where <literal>CID</literal> is
+ mandatory and cannot be <constant>VMADDR_CID_ANY</constant>, as described in
+ <citerefentry><refentrytitle>sd_notify</refentrytitle><manvolnum>3</manvolnum></citerefentry>.</para>
+ </listitem>
+ </varlistentry>
+
</variablelist>
</refsect1>
diff --git a/src/basic/socket-util.c b/src/basic/socket-util.c
index 54f5f1cc5b..d7946a3641 100644
--- a/src/basic/socket-util.c
+++ b/src/basic/socket-util.c
@@ -1472,3 +1472,71 @@ int connect_unix_path(int fd, int dir_fd, const char *path) {
return RET_NERRNO(connect(fd, &sa.sa, salen));
}
+
+int socket_address_parse_unix(SocketAddress *ret_address, const char *s) { /* Parses the AF_UNIX address string s into *ret_address; returns 0 on success, negative on failure. */
+ struct sockaddr_un un;
+ int r;
+
+ assert(ret_address);
+ assert(s);
+
+ if (!IN_SET(*s, '/', '@')) /* Only strings whose first character is '/' or '@' are handled here. */
+ return -EPROTO; /* Not this parser's format; *ret_address is left untouched. */
+
+ r = sockaddr_un_set_path(&un, s);
+ if (r < 0)
+ return r;
+
+ *ret_address = (SocketAddress) {
+ .sockaddr.un = un,
+ .size = r, /* The non-negative result of sockaddr_un_set_path() is stored as the address size. */
+ };
+
+ return 0;
+}
+
+int socket_address_parse_vsock(SocketAddress *ret_address, const char *s) {
+ /* Parses an AF_VSOCK address in vsock:cid:port notation into *ret_address. An empty cid becomes VMADDR_CID_ANY. Returns 0 on success, negative on failure. */
+ _cleanup_free_ char *n = NULL;
+ char *e, *cid_start;
+ unsigned port, cid;
+ int r;
+
+ assert(ret_address);
+ assert(s);
+
+ cid_start = startswith(s, "vsock:");
+ if (!cid_start)
+ return -EPROTO; /* No "vsock:" prefix, so this is not an AF_VSOCK address string. */
+
+ e = strchr(cid_start, ':'); /* First ':' after the prefix: separates the cid from the port. */
+ if (!e)
+ return -EINVAL;
+
+ r = safe_atou(e+1, &port);
+ if (r < 0)
+ return r;
+
+ n = strndup(cid_start, e - cid_start); /* Copy of just the cid part, so it can be parsed on its own. */
+ if (!n)
+ return -ENOMEM;
+
+ if (isempty(n))
+ cid = VMADDR_CID_ANY; /* "vsock::port" form: no cid was given. */
+ else {
+ r = safe_atou(n, &cid);
+ if (r < 0)
+ return r;
+ }
+
+ *ret_address = (SocketAddress) {
+ .sockaddr.vm = {
+ .svm_cid = cid,
+ .svm_family = AF_VSOCK,
+ .svm_port = port,
+ },
+ .size = sizeof(struct sockaddr_vm),
+ };
+
+ return 0;
+}
diff --git a/src/basic/socket-util.h b/src/basic/socket-util.h
index 0b8d53e895..5cb35f65fb 100644
--- a/src/basic/socket-util.h
+++ b/src/basic/socket-util.h
@@ -336,3 +336,9 @@ int socket_get_mtu(int fd, int af, size_t *ret);
#define UCRED_INVALID { .pid = 0, .uid = UID_INVALID, .gid = GID_INVALID }
int connect_unix_path(int fd, int dir_fd, const char *path);
+
+/* Parses AF_UNIX and AF_VSOCK addresses. AF_INET[6] requires some netlink calls, so it cannot be handled in
+ * src/basic/ and is done by socket_address_parse() in src/shared/ instead. Both return -EPROTO in case of
+ * protocol mismatch. */
+int socket_address_parse_unix(SocketAddress *ret_address, const char *s);
+int socket_address_parse_vsock(SocketAddress *ret_address, const char *s);
diff --git a/src/core/import-creds.c b/src/core/import-creds.c
index 1f5a15f73b..ade509be34 100644
--- a/src/core/import-creds.c
+++ b/src/core/import-creds.c
@@ -713,5 +713,18 @@ int import_credentials(void) {
r = q;
}
+ if (r >= 0) {
+ _cleanup_free_ char *address = NULL;
+
+ r = read_credential("vmm.notify_socket", (void **)&address, /* ret_size= */ NULL);
+ if (r < 0 && !IN_SET(r, -ENOENT, -ENXIO))
+ log_warning_errno(r, "Failed to read 'vmm.notify_socket' credential, ignoring: %m");
+ else if (r >= 0 && !isempty(address)) {
+ r = setenv("NOTIFY_SOCKET", address, /* replace= */ 1);
+ if (r < 0)
+ log_warning_errno(errno, "Failed to set $NOTIFY_SOCKET environment variable, ignoring: %m");
+ }
+ }
+
return r;
}
diff --git a/src/libsystemd/sd-daemon/sd-daemon.c b/src/libsystemd/sd-daemon/sd-daemon.c
index 6da351dd9b..8dc11aeb30 100644
--- a/src/libsystemd/sd-daemon/sd-daemon.c
+++ b/src/libsystemd/sd-daemon/sd-daemon.c
@@ -433,6 +433,23 @@ _public_ int sd_is_mq(int fd, const char *path) {
return 1;
}
+static int vsock_bind_privileged_port(int fd) { /* Binds fd to an AF_VSOCK port below 1024, trying 1023 first and counting down. */
+ union sockaddr_union sa = {
+ .vm.svm_family = AF_VSOCK,
+ .vm.svm_cid = VMADDR_CID_ANY,
+ .vm.svm_port = 1023, /* First candidate port; the loop below decrements from here. */
+ };
+ int r;
+
+ assert(fd >= 0);
+
+ do
+ r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm)));
+ while (r == -EADDRINUSE && --sa.vm.svm_port > 0); /* Retry on the next lower port only while the current one is in use; port 0 is never tried. */
+
+ return r; /* Result of the last bind() attempt, as converted by RET_NERRNO(). */
+}
+
_public_ int sd_pid_notify_with_fds(
pid_t pid,
int unset_environment,
@@ -440,12 +457,12 @@ _public_ int sd_pid_notify_with_fds(
const int *fds,
unsigned n_fds) {
- union sockaddr_union sockaddr;
+ SocketAddress address;
struct iovec iovec;
struct msghdr msghdr = {
.msg_iov = &iovec,
.msg_iovlen = 1,
- .msg_name = &sockaddr,
+ .msg_name = &address.sockaddr,
};
_cleanup_close_ int fd = -EBADF;
struct cmsghdr *cmsg = NULL;
@@ -467,17 +484,53 @@ _public_ int sd_pid_notify_with_fds(
if (!e)
return 0;
- r = sockaddr_un_set_path(&sockaddr.un, e);
+ /* Allow AF_UNIX and AF_VSOCK, reject the rest. */
+ r = socket_address_parse_unix(&address, e);
+ if (r == -EPROTO)
+ r = socket_address_parse_vsock(&address, e);
if (r < 0)
goto finish;
- msghdr.msg_namelen = r;
+ msghdr.msg_namelen = address.size;
- fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0);
- if (fd < 0) {
- r = -errno;
+ /* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out,
+ * we always require a specific CID. */
+ if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) {
+ r = -EINVAL;
goto finish;
}
+ /* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns
+ * ENODEV. Fallback to SOCK_SEQPACKET in that case. */
+ fd = socket(address.sockaddr.sa.sa_family, SOCK_DGRAM|SOCK_CLOEXEC, 0);
+ if (fd < 0) {
+ if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK) {
+ r = -errno;
+ goto finish;
+ }
+
+ fd = socket(address.sockaddr.sa.sa_family, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
+ if (fd < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ r = vsock_bind_privileged_port(fd);
+ if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
+ goto finish;
+
+ if (connect(fd, &address.sockaddr.sa, address.size) < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ msghdr.msg_name = NULL;
+ msghdr.msg_namelen = 0;
+ } else if (address.sockaddr.sa.sa_family == AF_VSOCK) {
+ r = vsock_bind_privileged_port(fd);
+ if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
+ goto finish;
+ }
+
(void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
iovec = IOVEC_MAKE_STRING(state);
diff --git a/src/shared/socket-netlink.c b/src/shared/socket-netlink.c
index 494047a5d1..e115dff506 100644
--- a/src/shared/socket-netlink.c
+++ b/src/shared/socket-netlink.c
@@ -17,120 +17,74 @@
#include "string-util.h"
int socket_address_parse(SocketAddress *a, const char *s) {
- _cleanup_free_ char *n = NULL;
- char *e;
+ uint16_t port;
int r;
assert(a);
assert(s);
- if (IN_SET(*s, '/', '@')) {
- /* AF_UNIX socket */
- struct sockaddr_un un;
-
- r = sockaddr_un_set_path(&un, s);
- if (r < 0)
- return r;
-
- *a = (SocketAddress) {
- .sockaddr.un = un,
- .size = r,
- };
+ r = socket_address_parse_unix(a, s);
+ if (r == -EPROTO)
+ r = socket_address_parse_vsock(a, s);
+ if (r != -EPROTO)
+ return r;
- } else if (startswith(s, "vsock:")) {
- /* AF_VSOCK socket in vsock:cid:port notation */
- const char *cid_start = s + STRLEN("vsock:");
- unsigned port, cid;
+ r = parse_ip_port(s, &port);
+ if (r == -ERANGE)
+ return r; /* Valid port syntax, but the numerical value is wrong for a port. */
+ if (r >= 0) {
+ /* Just a port */
+ if (socket_ipv6_is_supported())
+ *a = (SocketAddress) {
+ .sockaddr.in6 = {
+ .sin6_family = AF_INET6,
+ .sin6_port = htobe16(port),
+ .sin6_addr = in6addr_any,
+ },
+ .size = sizeof(struct sockaddr_in6),
+ };
+ else
+ *a = (SocketAddress) {
+ .sockaddr.in = {
+ .sin_family = AF_INET,
+ .sin_port = htobe16(port),
+ .sin_addr.s_addr = INADDR_ANY,
+ },
+ .size = sizeof(struct sockaddr_in),
+ };
- e = strchr(cid_start, ':');
- if (!e)
- return -EINVAL;
+ } else {
+ union in_addr_union address;
+ int family, ifindex;
- r = safe_atou(e+1, &port);
+ r = in_addr_port_ifindex_name_from_string_auto(s, &family, &address, &port, &ifindex, NULL);
if (r < 0)
return r;
- n = strndup(cid_start, e - cid_start);
- if (!n)
- return -ENOMEM;
-
- if (isempty(n))
- cid = VMADDR_CID_ANY;
- else {
- r = safe_atou(n, &cid);
- if (r < 0)
- return r;
- }
-
- *a = (SocketAddress) {
- .sockaddr.vm = {
- .svm_cid = cid,
- .svm_family = AF_VSOCK,
- .svm_port = port,
- },
- .size = sizeof(struct sockaddr_vm),
- };
-
- } else {
- uint16_t port;
-
- r = parse_ip_port(s, &port);
- if (r == -ERANGE)
- return r; /* Valid port syntax, but the numerical value is wrong for a port. */
- if (r >= 0) {
- /* Just a port */
- if (socket_ipv6_is_supported())
- *a = (SocketAddress) {
- .sockaddr.in6 = {
- .sin6_family = AF_INET6,
- .sin6_port = htobe16(port),
- .sin6_addr = in6addr_any,
- },
- .size = sizeof(struct sockaddr_in6),
- };
- else
- *a = (SocketAddress) {
- .sockaddr.in = {
- .sin_family = AF_INET,
- .sin_port = htobe16(port),
- .sin_addr.s_addr = INADDR_ANY,
- },
- .size = sizeof(struct sockaddr_in),
- };
-
- } else {
- union in_addr_union address;
- int family, ifindex;
-
- r = in_addr_port_ifindex_name_from_string_auto(s, &family, &address, &port, &ifindex, NULL);
- if (r < 0)
- return r;
-
- if (port == 0) /* No port, no go. */
- return -EINVAL;
+ if (port == 0) /* No port, no go. */
+ return -EINVAL;
- if (family == AF_INET)
- *a = (SocketAddress) {
- .sockaddr.in = {
- .sin_family = AF_INET,
- .sin_addr = address.in,
- .sin_port = htobe16(port),
- },
- .size = sizeof(struct sockaddr_in),
- };
- else if (family == AF_INET6)
- *a = (SocketAddress) {
- .sockaddr.in6 = {
- .sin6_family = AF_INET6,
- .sin6_addr = address.in6,
- .sin6_port = htobe16(port),
- .sin6_scope_id = ifindex,
- },
- .size = sizeof(struct sockaddr_in6),
- };
- else
- assert_not_reached();
- }
+ if (family == AF_INET)
+ *a = (SocketAddress) {
+ .sockaddr.in = {
+ .sin_family = AF_INET,
+ .sin_addr = address.in,
+ .sin_port = htobe16(port),
+ },
+ .size = sizeof(struct sockaddr_in),
+ };
+ else if (family == AF_INET6)
+ *a = (SocketAddress) {
+ .sockaddr.in6 = {
+ .sin6_family = AF_INET6,
+ .sin6_addr = address.in6,
+ .sin6_port = htobe16(port),
+ .sin6_scope_id = ifindex,
+ },
+ .size = sizeof(struct sockaddr_in6),
+ };
+ else
+ assert_not_reached();
}
return 0;