summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAaron Conole <aconole@redhat.com>2023-03-16 08:00:39 -0400
committerIlya Maximets <i.maximets@ovn.org>2023-03-22 18:56:02 +0100
commit07cf5810de8da12c700324bc421bde92376abe06 (patch)
treecae90caceda929ae05aa2028bb023f64c6f69bfd
parente90a0727f17f6ad915a32735a8c0b282f2c8cd6f (diff)
downloadopenvswitch-07cf5810de8da12c700324bc421bde92376abe06.tar.gz
dpdk: Allow retaining CAP_SYS_RAWIO privileges.
Open vSwitch generally tries to let the underlying operating system managed the low level details of hardware, for example DMA mapping, bus arbitration, etc. However, when using DPDK, the underlying operating system yields control of many of these details to userspace for management. In the case of some DPDK port drivers, configuring rte_flow or even allocating resources may require access to iopl/ioperm calls, which are guarded by the CAP_SYS_RAWIO privilege on linux systems. These calls are dangerous, and can allow a process to completely compromise a system. However, they are needed in the case of some userspace driver code which manages the hardware (for example, the mlx implementation of backend support for rte_flow). Here, we create an opt-in flag passed to the command line to allow this access. We need to do this before ever accessing the database, because we want to drop all privileges asap, and cannot wait for a connection to the database to be established and functional before dropping. There may be distribution specific ways to do capability management as well (using for example, systemd), but they are not as universal to the vswitchd as a flag. Reviewed-by: Simon Horman <simon.horman@corigine.com> Signed-off-by: Aaron Conole <aconole@redhat.com> Acked-by: Flavio Leitner <fbl@sysclose.org> Acked-by: Gaetan Rivet <gaetanr@nvidia.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
-rw-r--r--NEWS4
-rw-r--r--lib/daemon-unix.c33
-rw-r--r--lib/daemon-windows.c6
-rw-r--r--lib/daemon.c2
-rw-r--r--lib/daemon.h4
-rw-r--r--ovsdb/ovsdb-client.c6
-rw-r--r--ovsdb/ovsdb-server.c4
-rw-r--r--tests/test-netflow.c2
-rw-r--r--tests/test-sflow.c2
-rw-r--r--tests/test-unixctl.c2
-rw-r--r--utilities/ovs-ofctl.c4
-rw-r--r--utilities/ovs-testcontroller.c4
-rw-r--r--vswitchd/ovs-vswitchd.8.in9
-rw-r--r--vswitchd/ovs-vswitchd.c11
14 files changed, 67 insertions, 26 deletions
diff --git a/NEWS b/NEWS
index 72b9024e6..8771ee618 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,10 @@ Post-v3.1.0
in order to create OVSDB sockets with access mode of 0770.
- QoS:
* Added new configuration option 'jitter' for a linux-netem QoS type.
+ - DPDK:
+ * ovs-vswitchd will keep the CAP_SYS_RAWIO capability when started
+ with the --hw-rawio-access command line option. This allows the
+ process extra privileges when mapping physical interconnect memory.
v3.1.0 - 16 Feb 2023
diff --git a/lib/daemon-unix.c b/lib/daemon-unix.c
index 1a7ba427d..4fdc6e3c4 100644
--- a/lib/daemon-unix.c
+++ b/lib/daemon-unix.c
@@ -88,7 +88,8 @@ static bool switch_user = false;
static uid_t uid;
static gid_t gid;
static char *user = NULL;
-static void daemon_become_new_user__(bool access_datapath);
+static void daemon_become_new_user__(bool access_datapath,
+ bool access_hardware_ports);
static void check_already_running(void);
static int lock_pidfile(FILE *, int command);
@@ -443,13 +444,13 @@ monitor_daemon(pid_t daemon_pid)
* daemonize_complete()) or that it failed to start up (by exiting with a
* nonzero exit code). */
void
-daemonize_start(bool access_datapath)
+daemonize_start(bool access_datapath, bool access_hardware_ports)
{
assert_single_threaded();
daemonize_fd = -1;
if (switch_user) {
- daemon_become_new_user__(access_datapath);
+ daemon_become_new_user__(access_datapath, access_hardware_ports);
switch_user = false;
}
@@ -807,7 +808,8 @@ daemon_become_new_user_unix(void)
/* Linux specific implementation of daemon_become_new_user()
* using libcap-ng. */
static void
-daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
+daemon_become_new_user_linux(bool access_datapath OVS_UNUSED,
+ bool access_hardware_ports OVS_UNUSED)
{
#if defined __linux__ && HAVE_LIBCAPNG
int ret;
@@ -827,6 +829,20 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
ret = capng_update(CAPNG_ADD, cap_sets, CAP_NET_ADMIN)
|| capng_update(CAPNG_ADD, cap_sets, CAP_NET_RAW)
|| capng_update(CAPNG_ADD, cap_sets, CAP_NET_BROADCAST);
+#ifdef DPDK_NETDEV
+ if (access_hardware_ports && !ret) {
+ ret = capng_update(CAPNG_ADD, cap_sets, CAP_SYS_RAWIO);
+ if (!ret) {
+ VLOG_INFO("The Linux capability CAP_SYS_RAWIO "
+ "is enabled.");
+ }
+ }
+#else
+ if (access_hardware_ports) {
+ VLOG_WARN("No driver requires Linux capability "
+ "CAP_SYS_RAWIO, disabling it.");
+ }
+#endif
}
} else {
ret = -1;
@@ -854,7 +870,7 @@ daemon_become_new_user_linux(bool access_datapath OVS_UNUSED)
}
static void
-daemon_become_new_user__(bool access_datapath)
+daemon_become_new_user__(bool access_datapath, bool access_hardware_ports)
{
/* If vlog file has been created, change its owner to the non-root user
* as specifed by the --user option. */
@@ -862,7 +878,8 @@ daemon_become_new_user__(bool access_datapath)
if (LINUX) {
if (LIBCAPNG) {
- daemon_become_new_user_linux(access_datapath);
+ daemon_become_new_user_linux(access_datapath,
+ access_hardware_ports);
} else {
VLOG_FATAL("%s: fail to downgrade user using libcap-ng. "
"(libcap-ng is not configured at compile time), "
@@ -877,11 +894,11 @@ daemon_become_new_user__(bool access_datapath)
* However, there in case the user switch needs to be done
* before daemonize_start(), the following API can be used. */
void
-daemon_become_new_user(bool access_datapath)
+daemon_become_new_user(bool access_datapath, bool access_hardware_ports)
{
assert_single_threaded();
if (switch_user) {
- daemon_become_new_user__(access_datapath);
+ daemon_become_new_user__(access_datapath, access_hardware_ports);
/* daemonize_start() should not switch user again. */
switch_user = false;
}
diff --git a/lib/daemon-windows.c b/lib/daemon-windows.c
index 7e5f264f5..4e6bbe0f0 100644
--- a/lib/daemon-windows.c
+++ b/lib/daemon-windows.c
@@ -498,7 +498,8 @@ make_pidfile(void)
}
void
-daemonize_start(bool access_datapath OVS_UNUSED)
+daemonize_start(bool access_datapath OVS_UNUSED,
+ bool access_hardware_ports OVS_UNUSED)
{
if (pidfile) {
make_pidfile();
@@ -526,7 +527,8 @@ daemonize_complete(void)
}
void
-daemon_become_new_user(bool access_datapath OVS_UNUSED)
+daemon_become_new_user(bool access_datapath OVS_UNUSED,
+ bool access_hardware_ports OVS_UNUSED)
{
}
diff --git a/lib/daemon.c b/lib/daemon.c
index 3249c5ab4..1e1c019eb 100644
--- a/lib/daemon.c
+++ b/lib/daemon.c
@@ -48,7 +48,7 @@ get_detach(void)
void
daemonize(void)
{
- daemonize_start(false);
+ daemonize_start(false, false);
daemonize_complete();
}
diff --git a/lib/daemon.h b/lib/daemon.h
index 094157496..42372d146 100644
--- a/lib/daemon.h
+++ b/lib/daemon.h
@@ -167,10 +167,10 @@ void set_detach(void);
bool get_detach(void);
void daemon_save_fd(int fd);
void daemonize(void);
-void daemonize_start(bool access_datapath);
+void daemonize_start(bool access_datapath, bool access_hardware_ports);
void daemonize_complete(void);
void daemon_set_new_user(const char * user_spec);
-void daemon_become_new_user(bool access_datapath);
+void daemon_become_new_user(bool access_datapath, bool access_hardware_ports);
void daemon_usage(void);
void daemon_disable_self_confinement(void);
bool daemon_should_self_confine(void);
diff --git a/ovsdb/ovsdb-client.c b/ovsdb/ovsdb-client.c
index f1b8d6491..bae2c5f04 100644
--- a/ovsdb/ovsdb-client.c
+++ b/ovsdb/ovsdb-client.c
@@ -250,7 +250,7 @@ main(int argc, char *argv[])
parse_options(argc, argv);
fatal_ignore_sigpipe();
- daemon_become_new_user(false);
+ daemon_become_new_user(false, false);
if (optind >= argc) {
ovs_fatal(0, "missing command name; use --help for help");
}
@@ -1392,7 +1392,7 @@ do_monitor__(struct jsonrpc *rpc, const char *database,
daemon_save_fd(STDOUT_FILENO);
daemon_save_fd(STDERR_FILENO);
- daemonize_start(false);
+ daemonize_start(false, false);
if (get_detach()) {
int error;
@@ -2276,7 +2276,7 @@ do_lock(struct jsonrpc *rpc, const char *method, const char *lock)
getting a reply of the previous
request. */
daemon_save_fd(STDOUT_FILENO);
- daemonize_start(false);
+ daemonize_start(false, false);
lock_req_init(&lock_req, method, lock);
if (get_detach()) {
diff --git a/ovsdb/ovsdb-server.c b/ovsdb/ovsdb-server.c
index 33ca4910d..4fea2dbda 100644
--- a/ovsdb/ovsdb-server.c
+++ b/ovsdb/ovsdb-server.c
@@ -341,7 +341,7 @@ main(int argc, char *argv[])
&run_command, &sync_from, &sync_exclude, &active);
is_backup = sync_from && !active;
- daemon_become_new_user(false);
+ daemon_become_new_user(false, false);
/* Create and initialize 'config_tmpfile' as a temporary file to hold
* ovsdb-server's most basic configuration, and then save our initial
@@ -359,7 +359,7 @@ main(int argc, char *argv[])
save_config__(config_tmpfile, &remotes, &db_filenames, sync_from,
sync_exclude, is_backup);
- daemonize_start(false);
+ daemonize_start(false, false);
/* Load the saved config. */
load_config(config_tmpfile, &remotes, &db_filenames, &sync_from,
diff --git a/tests/test-netflow.c b/tests/test-netflow.c
index d2322d450..7f89cfcae 100644
--- a/tests/test-netflow.c
+++ b/tests/test-netflow.c
@@ -195,7 +195,7 @@ test_netflow_main(int argc, char *argv[])
}
daemon_save_fd(STDOUT_FILENO);
- daemonize_start(false);
+ daemonize_start(false, false);
error = unixctl_server_create(NULL, &server);
if (error) {
diff --git a/tests/test-sflow.c b/tests/test-sflow.c
index 460d4d6c5..3c617bdd1 100644
--- a/tests/test-sflow.c
+++ b/tests/test-sflow.c
@@ -709,7 +709,7 @@ test_sflow_main(int argc, char *argv[])
}
daemon_save_fd(STDOUT_FILENO);
- daemonize_start(false);
+ daemonize_start(false, false);
error = unixctl_server_create(NULL, &server);
if (error) {
diff --git a/tests/test-unixctl.c b/tests/test-unixctl.c
index 3eadf54cd..9e8982789 100644
--- a/tests/test-unixctl.c
+++ b/tests/test-unixctl.c
@@ -83,7 +83,7 @@ test_unixctl_main(int argc, char *argv[])
fatal_ignore_sigpipe();
parse_options(&argc, &argv, &unixctl_path);
- daemonize_start(false);
+ daemonize_start(false, false);
int retval = unixctl_server_create(unixctl_path, &unixctl);
if (retval) {
exit(EXIT_FAILURE);
diff --git a/utilities/ovs-ofctl.c b/utilities/ovs-ofctl.c
index 3ce4e82ec..24d0941cf 100644
--- a/utilities/ovs-ofctl.c
+++ b/utilities/ovs-ofctl.c
@@ -173,7 +173,7 @@ main(int argc, char *argv[])
ctx.argc = argc - optind;
ctx.argv = argv + optind;
- daemon_become_new_user(false);
+ daemon_become_new_user(false, false);
if (read_only) {
ovs_cmdl_run_command_read_only(&ctx, get_all_commands());
} else {
@@ -2127,7 +2127,7 @@ monitor_vconn(struct vconn *vconn, bool reply_to_echo_requests,
int error;
daemon_save_fd(STDERR_FILENO);
- daemonize_start(false);
+ daemonize_start(false, false);
error = unixctl_server_create(unixctl_path, &server);
if (error) {
ovs_fatal(error, "failed to create unixctl server");
diff --git a/utilities/ovs-testcontroller.c b/utilities/ovs-testcontroller.c
index b489ff5fc..9f2fbfdf5 100644
--- a/utilities/ovs-testcontroller.c
+++ b/utilities/ovs-testcontroller.c
@@ -109,7 +109,7 @@ main(int argc, char *argv[])
parse_options(argc, argv);
fatal_ignore_sigpipe();
- daemon_become_new_user(false);
+ daemon_become_new_user(false, false);
if (argc - optind < 1) {
ovs_fatal(0, "at least one vconn argument required; "
@@ -148,7 +148,7 @@ main(int argc, char *argv[])
ovs_fatal(0, "no active or passive switch connections");
}
- daemonize_start(false);
+ daemonize_start(false, false);
retval = unixctl_server_create(unixctl_path, &unixctl);
if (retval) {
diff --git a/vswitchd/ovs-vswitchd.8.in b/vswitchd/ovs-vswitchd.8.in
index 9569265fc..10c6e077b 100644
--- a/vswitchd/ovs-vswitchd.8.in
+++ b/vswitchd/ovs-vswitchd.8.in
@@ -81,6 +81,15 @@ unavailable or unsuccessful.
.SS "DPDK Options"
For details on initializing \fBovs\-vswitchd\fR to use DPDK ports,
refer to the documentation or \fBovs\-vswitchd.conf.db\fR(5).
+.SS "DPDK HW Access Options"
+.IP "\fB\-\-hw\-rawio\-access\fR"
+Tells \fBovs\-vswitchd\fR to retain the \fBCAP_SYS_RAWIO\fR capability,
+to allow userspace drivers access to raw hardware memory. This will
+also allow the \fBovs\-vswitchd\fR daemon to call \fBiopl()\fR and
+\fBioperm()\fR functions as well as access memory devices to set port
+access. This is a \fBvery\fR powerful capability, so generally only
+enable as needed for specific hardware (for example mlx5 with full
+hardware offload via rte_flow).
.SS "Daemon Options"
.ds DD \
\fBovs\-vswitchd\fR detaches only after it has connected to the \
diff --git a/vswitchd/ovs-vswitchd.c b/vswitchd/ovs-vswitchd.c
index 407bfc60e..a244d2f70 100644
--- a/vswitchd/ovs-vswitchd.c
+++ b/vswitchd/ovs-vswitchd.c
@@ -60,6 +60,9 @@ VLOG_DEFINE_THIS_MODULE(vswitchd);
* the kernel from paging any of its memory to disk. */
static bool want_mlockall;
+/* --hw-rawio-access: If set, retains CAP_SYS_RAWIO privileges. */
+static bool hw_rawio_access;
+
static unixctl_cb_func ovs_vswitchd_exit;
static char *parse_options(int argc, char *argv[], char **unixctl_path);
@@ -89,7 +92,7 @@ main(int argc, char *argv[])
remote = parse_options(argc, argv, &unixctl_path);
fatal_ignore_sigpipe();
- daemonize_start(true);
+ daemonize_start(true, hw_rawio_access);
if (want_mlockall) {
#ifdef HAVE_MLOCKALL
@@ -169,6 +172,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
OPT_DPDK,
SSL_OPTION_ENUMS,
OPT_DUMMY_NUMA,
+ OPT_HW_RAWIO_ACCESS,
};
static const struct option long_options[] = {
{"help", no_argument, NULL, 'h'},
@@ -185,6 +189,7 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
{"disable-system-route", no_argument, NULL, OPT_DISABLE_SYSTEM_ROUTE},
{"dpdk", optional_argument, NULL, OPT_DPDK},
{"dummy-numa", required_argument, NULL, OPT_DUMMY_NUMA},
+ {"hw-rawio-access", no_argument, NULL, OPT_HW_RAWIO_ACCESS},
{NULL, 0, NULL, 0},
};
char *short_options = ovs_cmdl_long_options_to_short_options(long_options);
@@ -249,6 +254,10 @@ parse_options(int argc, char *argv[], char **unixctl_pathp)
ovs_numa_set_dummy(optarg);
break;
+ case OPT_HW_RAWIO_ACCESS:
+ hw_rawio_access = true;
+ break;
+
default:
abort();
}