summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Luca Boccassi <bluca@debian.org> 2023-01-03 18:08:09 +0100
committer: Luca Boccassi <bluca@debian.org> 2023-01-05 23:07:16 +0100
commit: 6c94cfcda5387bcec36867c7639c777179d6e7c2 (patch)
tree: 78fdb10d039209e3e8affc2ac8466ca2d8611c02
parent: 747b5d963ef8078032e1f6f7ee98f8725d8fb454 (diff)
download: systemd-6c94cfcda5387bcec36867c7639c777179d6e7c2.tar.gz
sd_notify: support AF_VSOCK
Allow sending notifications via AF_VSOCK, so that VMs can communicate to the hypervisor/VMM that they are finished booting. Note that if the hypervisor does not support SOCK_DGRAM over AF_VSOCK (i.e. QEMU at the time of writing), SOCK_SEQPACKET will be used instead.
-rw-r--r-- man/sd_notify.xml 27
-rw-r--r-- src/libsystemd/sd-daemon/sd-daemon.c 67
2 files changed, 80 insertions, 14 deletions
diff --git a/man/sd_notify.xml b/man/sd_notify.xml
index de402950bb..021cd0384f 100644
--- a/man/sd_notify.xml
+++ b/man/sd_notify.xml
@@ -368,13 +368,26 @@
<xi:include href="libsystemd-pkgconfig.xml" xpointer="pkgconfig-text"/>
<para>These functions send a single datagram with the
- state string as payload to the <constant>AF_UNIX</constant> socket
- referenced in the <varname>$NOTIFY_SOCKET</varname> environment
- variable. If the first character of
- <varname>$NOTIFY_SOCKET</varname> is <literal>@</literal>, the
- string is understood as Linux abstract namespace socket. The
- datagram is accompanied by the process credentials of the sending
- service, using SCM_CREDENTIALS.</para>
+ state string as payload to the socket referenced in the
+ <varname>$NOTIFY_SOCKET</varname> environment variable. If the
+ first character of <varname>$NOTIFY_SOCKET</varname> is
+ <literal>/</literal> or <literal>@</literal>, the string is understood
+ as an <constant>AF_UNIX</constant> or Linux abstract namespace socket
+ (respectively), and in both cases the datagram is accompanied by the
+ process credentials of the sending service, using SCM_CREDENTIALS. If
+ the string starts with <literal>vsock:</literal> then the string is
+ understood as an <constant>AF_VSOCK</constant> address, which is useful
+ for hypervisors/VMMs or other processes on the host to receive a
+ notification when a virtual machine has finished booting. Note that in
+ case the hypervisor does not support <constant>SOCK_DGRAM</constant>
+ over <constant>AF_VSOCK</constant>, <constant>SOCK_SEQPACKET</constant>
+ will be used instead. The address should be in the form:
+ <literal>vsock:CID:PORT</literal>. Note that unlike other uses of vsock,
+ the CID is mandatory and cannot be <literal>VMADDR_CID_ANY</literal>.
+ Note that PID1 will send the VSOCK packets from a privileged port
+ (i.e.: lower than 1024), as an attempt to address concerns that unprivileged
+ processes in the guest might try to send malicious notifications to the
+ host, driving it to make destructive decisions based on them.</para>
</refsect1>
<refsect1>
diff --git a/src/libsystemd/sd-daemon/sd-daemon.c b/src/libsystemd/sd-daemon/sd-daemon.c
index 6da351dd9b..8dc11aeb30 100644
--- a/src/libsystemd/sd-daemon/sd-daemon.c
+++ b/src/libsystemd/sd-daemon/sd-daemon.c
@@ -433,6 +433,23 @@ _public_ int sd_is_mq(int fd, const char *path) {
return 1;
}
+static int vsock_bind_privileged_port(int fd) { /* Bind fd to an AF_VSOCK port below 1024, trying 1023 first and moving downwards while the port is in use. */
+ union sockaddr_union sa = {
+ .vm.svm_family = AF_VSOCK,
+ .vm.svm_cid = VMADDR_CID_ANY,
+ .vm.svm_port = 1023, /* first candidate: the highest port below 1024 */
+ };
+ int r; /* result of the most recent bind(); compared against -EADDRINUSE below, so presumably a negative errno-style value on failure */
+
+ assert(fd >= 0);
+
+ do /* retry with the next lower port while the current one is taken; the loop ends once the port counter reaches 0 */
+ r = RET_NERRNO(bind(fd, &sa.sa, sizeof(sa.vm)));
+ while (r == -EADDRINUSE && --sa.vm.svm_port > 0);
+
+ return r; /* outcome of the last bind() attempt; still -EADDRINUSE if every port from 1023 down to 1 was in use */
+}
+
_public_ int sd_pid_notify_with_fds(
pid_t pid,
int unset_environment,
@@ -440,12 +457,12 @@ _public_ int sd_pid_notify_with_fds(
const int *fds,
unsigned n_fds) {
- union sockaddr_union sockaddr;
+ SocketAddress address;
struct iovec iovec;
struct msghdr msghdr = {
.msg_iov = &iovec,
.msg_iovlen = 1,
- .msg_name = &sockaddr,
+ .msg_name = &address.sockaddr,
};
_cleanup_close_ int fd = -EBADF;
struct cmsghdr *cmsg = NULL;
@@ -467,17 +484,53 @@ _public_ int sd_pid_notify_with_fds(
if (!e)
return 0;
- r = sockaddr_un_set_path(&sockaddr.un, e);
+ /* Allow AF_UNIX and AF_VSOCK, reject the rest. */
+ r = socket_address_parse_unix(&address, e);
+ if (r == -EPROTO)
+ r = socket_address_parse_vsock(&address, e);
if (r < 0)
goto finish;
- msghdr.msg_namelen = r;
+ msghdr.msg_namelen = address.size;
- fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0);
- if (fd < 0) {
- r = -errno;
+ /* If we didn't get an address (which is a normal pattern when specifying VSOCK tuples) error out,
+ * we always require a specific CID. */
+ if (address.sockaddr.vm.svm_family == AF_VSOCK && address.sockaddr.vm.svm_cid == VMADDR_CID_ANY) {
+ r = -EINVAL;
goto finish;
}
+ /* At the time of writing QEMU does not yet support AF_VSOCK + SOCK_DGRAM and returns
+ * ENODEV. Fallback to SOCK_SEQPACKET in that case. */
+ fd = socket(address.sockaddr.sa.sa_family, SOCK_DGRAM|SOCK_CLOEXEC, 0);
+ if (fd < 0) {
+ if (!(ERRNO_IS_NOT_SUPPORTED(errno) || errno == ENODEV) || address.sockaddr.sa.sa_family != AF_VSOCK) {
+ r = -errno;
+ goto finish;
+ }
+
+ fd = socket(address.sockaddr.sa.sa_family, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
+ if (fd < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ r = vsock_bind_privileged_port(fd);
+ if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
+ goto finish;
+
+ if (connect(fd, &address.sockaddr.sa, address.size) < 0) {
+ r = -errno;
+ goto finish;
+ }
+
+ msghdr.msg_name = NULL;
+ msghdr.msg_namelen = 0;
+ } else if (address.sockaddr.sa.sa_family == AF_VSOCK) {
+ r = vsock_bind_privileged_port(fd);
+ if (r < 0 && !ERRNO_IS_PRIVILEGE(r))
+ goto finish;
+ }
+
(void) fd_inc_sndbuf(fd, SNDBUF_SIZE);
iovec = IOVEC_MAKE_STRING(state);