summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorDavid Wilder <dwilder@us.ibm.com>2021-06-22 11:53:08 -0700
committerIlya Maximets <i.maximets@ovn.org>2021-07-07 23:35:57 +0200
commit3da3cc1a0cf281cd671d672bbdb09002938ec14e (patch)
tree7b33614c1b39f8bb46ebc65722ce4fb7741cf23d
parent154983c592da8cf4ea6289b8b271176d0ad962b7 (diff)
downloadopenvswitch-3da3cc1a0cf281cd671d672bbdb09002938ec14e.tar.gz
ovs-numa: Support non-contiguous numa nodes and offline CPU cores.
This change removes the assumption that numa nodes and cores are numbered contiguously in Linux. This change is required to support some Power systems. A check has been added to verify that cores are online, since offline cores result in non-contiguously numbered cores. DPDK EAL option generation is updated to work with non-contiguous numa nodes. These options can be seen in ovs-vswitchd.log. For example, a system containing only numa nodes 0 and 8 will generate the following: EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \ --socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0 Tests for pmd and dpif-netdev have been updated to validate non-contiguously numbered nodes. Signed-off-by: David Wilder <dwilder@us.ibm.com> Acked-by: Kevin Traynor <ktraynor@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
-rw-r--r--NEWS1
-rw-r--r--lib/dpdk.c57
-rw-r--r--lib/ovs-numa.c51
-rw-r--r--lib/ovs-numa.h2
-rw-r--r--tests/dpif-netdev.at2
-rw-r--r--tests/pmd.at61
6 files changed, 143 insertions, 31 deletions
diff --git a/NEWS b/NEWS
index f758a501d..dddd57fc2 100644
--- a/NEWS
+++ b/NEWS
@@ -16,6 +16,7 @@ Post-v2.15.0
* Auto load balancing of PMDs now partially supports cross-NUMA polling
cases, e.g if all PMD threads are running on the same NUMA node.
* Userspace datapath now supports up to 2^18 meters.
+ * Added support for systems with non-contiguous NUMA nodes and core ids.
- ovs-ctl:
* New option '--no-record-hostname' to disable hostname configuration
in ovsdb on startup.
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 2eaaa569c..0c910092c 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -130,22 +130,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args)
}
}
+static int
+compare_numa_node_list(const void *a_, const void *b_)
+{
+ int a = *(const int *) a_;
+ int b = *(const int *) b_;
+
+ if (a < b) {
+ return -1;
+ }
+ if (a > b) {
+ return 1;
+ }
+ return 0;
+}
+
static char *
construct_dpdk_socket_mem(void)
{
const char *def_value = "1024";
- int numa, numa_nodes = ovs_numa_get_n_numas();
struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER;
- if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) {
- numa_nodes = 1;
- }
+ /* Build a list of all numa nodes with at least one core. */
+ struct ovs_numa_dump *dump = ovs_numa_dump_n_cores_per_numa(1);
+ size_t n_numa_nodes = hmap_count(&dump->numas);
+ int *numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list);
- ds_put_cstr(&dpdk_socket_mem, def_value);
- for (numa = 1; numa < numa_nodes; ++numa) {
- ds_put_format(&dpdk_socket_mem, ",%s", def_value);
- }
+ const struct ovs_numa_info_numa *node;
+ int k = 0, last_node = 0;
+ FOR_EACH_NUMA_ON_DUMP(node, dump) {
+ if (k >= n_numa_nodes) {
+ break;
+ }
+ numa_node_list[k++] = node->numa_id;
+ }
+ qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list);
+
+ for (int i = 0; i < n_numa_nodes; i++) {
+ while (numa_node_list[i] > last_node &&
+ numa_node_list[i] != OVS_NUMA_UNSPEC &&
+ numa_node_list[i] <= MAX_NUMA_NODES) {
+ if (last_node == 0) {
+ ds_put_format(&dpdk_socket_mem, "%s", "0");
+ } else {
+ ds_put_format(&dpdk_socket_mem, ",%s", "0");
+ }
+ last_node++;
+ }
+ if (numa_node_list[i] == 0) {
+ ds_put_format(&dpdk_socket_mem, "%s", def_value);
+ } else {
+ ds_put_format(&dpdk_socket_mem, ",%s", def_value);
+ }
+ last_node++;
+ }
+ free(numa_node_list);
+ ovs_numa_dump_destroy(dump);
return ds_cstr(&dpdk_socket_mem);
}
diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c
index 6d0a68522..9e3fa5421 100644
--- a/lib/ovs-numa.c
+++ b/lib/ovs-numa.c
@@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa);
* This module stores the affinity information of numa nodes and cpu cores.
* It also provides functions to bookkeep the pin of threads on cpu cores.
*
- * It is assumed that the numa node ids and cpu core ids all start from 0 and
- * range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns N,
- * user can assume core ids from 0 to N-1 are all valid and there is a
- * 'struct cpu_core' for each id.
+ * It is assumed that the numa node ids and cpu core ids all start from 0.
+ * There is no guarantee that node and cpu ids are numbered consecutively.
+ * So, for example, if two nodes exist with ids 0 and 8,
+ * 'ovs_numa_get_n_nodes()' will return 2, no assumption of node numbering
+ * should be made.
*
* NOTE, this module should only be used by the main thread.
*
- * NOTE, the assumption above will fail when cpu hotplug is used. In that
- * case ovs-numa will not function correctly. For now, add a TODO entry
- * for addressing it in the future.
+ * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be
+ * invalidated whenever the system topology changes. Support for detecting
+ * topology changes has not been included. For now, add a TODO entry for
+ * addressing it in the future.
*
* TODO: Fix ovs-numa when cpu hotplug is used.
*/
-#define MAX_NUMA_NODES 128
/* numa node. */
struct numa_node {
@@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id)
* - "0,0,0,0": four cores on numa socket 0.
* - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets.
* - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets.
- *
- * The different numa ids must be consecutives or the function will abort. */
+ * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous.
+ */
static void
discover_numa_and_core_dummy(void)
{
char *conf = xstrdup(dummy_config);
char *id, *saveptr = NULL;
unsigned i = 0;
- long max_numa_id = 0;
for (id = strtok_r(conf, ",", &saveptr); id;
id = strtok_r(NULL, ",", &saveptr)) {
@@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void)
continue;
}
- max_numa_id = MAX(max_numa_id, numa_id);
-
hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0));
if (hnode) {
@@ -169,10 +167,27 @@ discover_numa_and_core_dummy(void)
free(conf);
- if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) {
- ovs_fatal(0, "dummy numa contains non consecutive numa ids");
+}
+
+#ifdef __linux__
+/* Check if a CPU is detected and online. */
+static int
+cpu_detected(unsigned int core_id)
+{
+ char path[PATH_MAX];
+ int len = snprintf(path, sizeof(path),
+ "/sys/devices/system/cpu/cpu%d/topology/core_id",
+ core_id);
+ if (len <= 0 || (unsigned) len >= sizeof(path)) {
+ return 0;
+ }
+ if (access(path, F_OK) != 0) {
+ return 0;
}
+
+ return 1;
}
+#endif /* __linux__ */
/* Discovers all numa nodes and the corresponding cpu cores.
* Constructs the 'struct numa_node' and 'struct cpu_core'. */
@@ -219,7 +234,9 @@ discover_numa_and_core(void)
unsigned core_id;
core_id = strtoul(subdir->d_name + 3, NULL, 10);
- insert_new_cpu_core(n, core_id);
+ if (cpu_detected(core_id)) {
+ insert_new_cpu_core(n, core_id);
+ }
}
}
closedir(dir);
@@ -229,7 +246,7 @@ discover_numa_and_core(void)
}
free(path);
- if (!dir || !numa_supported) {
+ if (!numa_supported) {
break;
}
}
diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h
index 8f2ea3430..ecc251a7f 100644
--- a/lib/ovs-numa.h
+++ b/lib/ovs-numa.h
@@ -26,6 +26,8 @@
#define OVS_CORE_UNSPEC INT_MAX
#define OVS_NUMA_UNSPEC INT_MAX
+#define MAX_NUMA_NODES 128
+
/* Dump of a list of 'struct ovs_numa_info'. */
struct ovs_numa_dump {
struct hmap cores;
diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at
index 16402ebae..53eee185a 100644
--- a/tests/dpif-netdev.at
+++ b/tests/dpif-netdev.at
@@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE],
fail-mode=secure -- \
add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \
add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [],
- [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])])
+ [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=normal])
diff --git a/tests/pmd.at b/tests/pmd.at
index e0d54c1dd..9c5824c55 100644
--- a/tests/pmd.at
+++ b/tests/pmd.at
@@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node])
OVS_VSWITCHD_START(
[add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \
add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \
- set Open_vSwitch . other_config:pmd-cpu-mask=3
-], [], [], [--dummy-numa 0,1])
+ set Open_vSwitch . other_config:pmd-cpu-mask=7
+], [], [], [--dummy-numa 0,1,8])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 action=controller])
@@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l
icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
])
+AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8])
+
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 2
+p1 1 8 2
+p2 0 1 1
+p2 1 1 1
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered)
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
+])
+
+AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log])
+
+AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)'])
+
+OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2])
+OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit])
+
+AT_CHECK([cat ofctl_monitor.log], [0], [dnl
+NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered)
+icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc
+])
+
+
OVS_VSWITCHD_STOP
AT_CLEANUP
@@ -584,7 +618,7 @@ AT_CLEANUP
AT_SETUP([PMD - rxq affinity - NUMA])
OVS_VSWITCHD_START(
- [], [], [], [--dummy-numa 0,0,0,1,1])
+ [], [], [], [--dummy-numa 0,0,0,1,1,8,8])
AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg])
AT_CHECK([ovs-ofctl add-flow br0 actions=controller])
@@ -601,21 +635,38 @@ p1 1 0 2
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"])
-dnl We moved the queues to different numa node. Expecting threads on
+dnl We moved the queues to a different contiguous numa node. Expecting threads on
dnl NUMA node 1 to be created.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3
p1 1 1 4
])
+AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"])
+
+dnl We moved the queues to a different non-contiguous numa node. Expecting threads on
+dnl NUMA node 8 to be created.
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 5
+p1 1 8 6
+])
+
AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"])
-dnl Queues splitted between NUMA nodes.
+dnl Queues split between contiguous NUMA nodes.
AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
p1 0 1 3
p1 1 0 1
])
+AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"])
+
+dnl Queues split between non-contiguous NUMA nodes.
+AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl
+p1 0 8 5
+p1 1 0 1
+])
+
AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity])
dnl We removed the rxq-affinity request. dpif-netdev should assign queues