diff options
author | Luca Boccassi <bluca@debian.org> | 2023-01-03 18:08:09 +0100 |
---|---|---|
committer | Luca Boccassi <bluca@debian.org> | 2023-01-05 23:07:16 +0100 |
commit | 6c94cfcda5387bcec36867c7639c777179d6e7c2 (patch) | |
tree | 78fdb10d039209e3e8affc2ac8466ca2d8611c02 | |
parent | 747b5d963ef8078032e1f6f7ee98f8725d8fb454 (diff) | |
download | systemd-6c94cfcda5387bcec36867c7639c777179d6e7c2.tar.gz |
sd_notify: support AF_VSOCK
Allow sending notifications via AF_VSOCK, so that VMs can communicate
to the hypervisor/VMM that they are finished booting.
Note that if the hypervisor does not support SOCK_DGRAM over AF_VSOCK
(e.g. QEMU at the time of writing), SOCK_SEQPACKET will be used instead.
-rw-r--r-- | man/sd_notify.xml | 27 | ||||
-rw-r--r-- | src/libsystemd/sd-daemon/sd-daemon.c | 67 |
2 files changed, 80 insertions, 14 deletions
diff --git a/man/sd_notify.xml b/man/sd_notify.xml index de402950bb..021cd0384f 100644 --- a/man/sd_notify.xml +++ b/man/sd_notify.xml @@ -368,13 +368,26 @@ <xi:include href="libsystemd-pkgconfig.xml" xpointer="pkgconfig-text"/> <para>These functions send a single datagram with the - state string as payload to the <constant>AF_UNIX</constant> socket - referenced in the <varname>$NOTIFY_SOCKET</varname> environment - variable. If the first character of - <varname>$NOTIFY_SOCKET</varname> is <literal>@</literal>, the - string is understood as Linux abstract namespace socket. The - datagram is accompanied by the process credentials of the sending - service, using SCM_CREDENTIALS.</para> + state string as payload to the socket referenced in the + <varname>$NOTIFY_SOCKET</varname> environment variable. If the + first character of <varname>$NOTIFY_SOCKET</varname> is + <literal>/</literal> or <literal>@</literal>, the string is understood + as an <constant>AF_UNIX</constant> or Linux abstract namespace socket + (respectively), and in both cases the datagram is accompanied by the + process credentials of the sending service, using SCM_CREDENTIALS. If + the string starts with <literal>vsock:</literal> then the string is + understood as an <constant>AF_VSOCK</constant> address, which is useful + for hypervisors/VMMs or other processes on the host to receive a + notification when a virtual machine has finished booting. Note that in + case the hypervisor does not support <constant>SOCK_DGRAM</constant> + over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant> + will be used instead. The address should be in the form: + <literal>vsock:CID:PORT</literal>. Note that unlike other uses of vsock, + the CID is mandatory and cannot be <literal>VMADDR_CID_ANY</literal>. 
+ Note that PID1 will send the VSOCK packets from a privileged port + (i.e.: lower than 1024), as an attempt to address concerns that unprivileged + processes in the guest might try to send malicious notifications to the + host, driving it to make destructive decisions based on them.</para> </refsect1> <refsect1> diff --git a/src/libsystemd/sd-daemon/sd-daemon.c b/src/libsystemd/sd-daemon/sd-daemon.c index 6da351dd9b..8dc11aeb30 100644 --- a/src/libsystemd/sd-daemon/sd-daemon.c +++ b/src/libsystemd/sd-daemon/sd-daemon.c @@ -433,6 +433,23 @@ _public_ int sd_is_mq(int fd, const char *path) { return 1; } +static int vsock_bind_privileged_port(int fd) { + union sockaddr_union sa = { + .vm.svm_family = AF_VSOCK, + .vm.svm_cid = VMADDR_CID_ANY, + .vm.svm_port = 1023, + }; + int r; + + assert(fd >= 0); + + do + r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm))); + while (r == -EADDRINUSE && --sa.vm.svm_port > 0); + + return r; +} + _public_ int sd_pid_notify_with_fds( pid_t pid, int unset_environment, @@ -440,12 +457,12 @@ _public_ int sd_pid_notify_with_fds( const int *fds, unsigned n_fds) { - union sockaddr_union sockaddr; + SocketAddress address; struct iovec iovec; struct msghdr msghdr = { .msg_iov = &iovec, .msg_iovlen = 1, - .msg_name = &sockaddr, + .msg_name = &address.sockaddr, }; _cleanup_close_ int fd = -EBADF; struct cmsghdr *cmsg = NULL; @@ -467,17 +484,53 @@ _public_ int sd_pid_notify_with_fds( if (!e) return 0; - r = sockaddr_un_set_path(&sockaddr.un, e); + /* Allow AF_UNIX and AF_VSOCK, reject the rest. */ + r = socket_address_parse_unix(&address, e); + if (r == -EPROTO) + r = socket_address_parse_vsock(&address, e); if (r < 0) goto finish; - msghdr.msg_namelen = r; + msghdr.msg_namelen = address.size; - fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0); - if (fd < 0) { - r = -errno; + /* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out, + * we always require a specific CID. 
*/ + if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) { + r = -EINVAL; goto finish; } + /* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns + * ENODEV. Fallback to SOCK_SEQPACKET in that case. */ + fd = socket(address.sockaddr.sa.sa_family, SOCK_DGRAM|SOCK_CLOEXEC, 0); + if (fd < 0) { + if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK) { + r = -errno; + goto finish; + } + + fd = socket(address.sockaddr.sa.sa_family, SOCK_SEQPACKET|SOCK_CLOEXEC, 0); + if (fd < 0) { + r = -errno; + goto finish; + } + + r = vsock_bind_privileged_port(fd); + if (r < 0 && !ERRNO_IS_PRIVILEGE(r)) + goto finish; + + if (connect(fd, &address.sockaddr.sa, address.size) < 0) { + r = -errno; + goto finish; + } + + msghdr.msg_name = NULL; + msghdr.msg_namelen = 0; + } else if (address.sockaddr.sa.sa_family == AF_VSOCK) { + r = vsock_bind_privileged_port(fd); + if (r < 0 && !ERRNO_IS_PRIVILEGE(r)) + goto finish; + } + (void) fd_inc_sndbuf(fd, SNDBUF_SIZE); iovec = IOVEC_MAKE_STRING(state); |