diff options
author | David Wilder <dwilder@us.ibm.com> | 2021-06-22 11:53:08 -0700 |
---|---|---|
committer | Ilya Maximets <i.maximets@ovn.org> | 2021-07-07 23:35:57 +0200 |
commit | 3da3cc1a0cf281cd671d672bbdb09002938ec14e (patch) | |
tree | 7b33614c1b39f8bb46ebc65722ce4fb7741cf23d | |
parent | 154983c592da8cf4ea6289b8b271176d0ad962b7 (diff) | |
download | openvswitch-3da3cc1a0cf281cd671d672bbdb09002938ec14e.tar.gz |
ovs-numa: Support non-contiguous numa nodes and offline CPU cores.
This change removes the assumption that numa nodes and cores are numbered
contiguously in linux. This change is required to support some Power
systems.
A check has been added to verify that cores are online,
offline cores result in non-contiguously numbered cores.
DPDK EAL option generation is updated to work with non-contiguous numa nodes.
These options can be seen in the ovs-vswitchd.log. For example:
a system containing only numa nodes 0 and 8 will generate the following:
EAL ARGS: ovs-vswitchd --socket-mem 1024,0,0,0,0,0,0,0,1024 \
--socket-limit 1024,0,0,0,0,0,0,0,1024 -l 0
Tests for pmd and dpif-netdev have been updated to validate non-contiguous
numbered nodes.
Signed-off-by: David Wilder <dwilder@us.ibm.com>
Acked-by: Kevin Traynor <ktraynor@redhat.com>
Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
-rw-r--r-- | NEWS | 1 | ||||
-rw-r--r-- | lib/dpdk.c | 57 | ||||
-rw-r--r-- | lib/ovs-numa.c | 51 | ||||
-rw-r--r-- | lib/ovs-numa.h | 2 | ||||
-rw-r--r-- | tests/dpif-netdev.at | 2 | ||||
-rw-r--r-- | tests/pmd.at | 61 |
6 files changed, 143 insertions, 31 deletions
@@ -16,6 +16,7 @@ Post-v2.15.0 * Auto load balancing of PMDs now partially supports cross-NUMA polling cases, e.g if all PMD threads are running on the same NUMA node. * Userspace datapath now supports up to 2^18 meters. + * Added support for systems with non-contiguous NUMA nodes and core ids. - ovs-ctl: * New option '--no-record-hostname' to disable hostname configuration in ovsdb on startup. diff --git a/lib/dpdk.c b/lib/dpdk.c index 2eaaa569c..0c910092c 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -130,22 +130,63 @@ construct_dpdk_options(const struct smap *ovs_other_config, struct svec *args) } } +static int +compare_numa_node_list(const void *a_, const void *b_) +{ + int a = *(const int *) a_; + int b = *(const int *) b_; + + if (a < b) { + return -1; + } + if (a > b) { + return 1; + } + return 0; +} + static char * construct_dpdk_socket_mem(void) { const char *def_value = "1024"; - int numa, numa_nodes = ovs_numa_get_n_numas(); struct ds dpdk_socket_mem = DS_EMPTY_INITIALIZER; - if (numa_nodes == 0 || numa_nodes == OVS_NUMA_UNSPEC) { - numa_nodes = 1; - } + /* Build a list of all numa nodes with at least one core. */ + struct ovs_numa_dump *dump = ovs_numa_dump_n_cores_per_numa(1); + size_t n_numa_nodes = hmap_count(&dump->numas); + int *numa_node_list = xcalloc(n_numa_nodes, sizeof *numa_node_list); - ds_put_cstr(&dpdk_socket_mem, def_value); - for (numa = 1; numa < numa_nodes; ++numa) { - ds_put_format(&dpdk_socket_mem, ",%s", def_value); - } + const struct ovs_numa_info_numa *node; + int k = 0, last_node = 0; + FOR_EACH_NUMA_ON_DUMP(node, dump) { + if (k >= n_numa_nodes) { + break; + } + numa_node_list[k++] = node->numa_id; + } + qsort(numa_node_list, k, sizeof *numa_node_list, compare_numa_node_list); + + for (int i = 0; i < n_numa_nodes; i++) { + while (numa_node_list[i] > last_node && + numa_node_list[i] != OVS_NUMA_UNSPEC && + numa_node_list[i] <= MAX_NUMA_NODES) { + if (last_node == 0) { + ds_put_format(&dpdk_socket_mem, "%s", "0"); + } else { + ds_put_format(&dpdk_socket_mem, ",%s", "0"); + } + last_node++; + } + if (numa_node_list[i] == 0) { + ds_put_format(&dpdk_socket_mem, "%s", def_value); + } else { + ds_put_format(&dpdk_socket_mem, ",%s", def_value); + } + last_node++; + } + free(numa_node_list); + ovs_numa_dump_destroy(dump); return ds_cstr(&dpdk_socket_mem); } diff --git a/lib/ovs-numa.c b/lib/ovs-numa.c index 6d0a68522..9e3fa5421 100644 --- a/lib/ovs-numa.c +++ b/lib/ovs-numa.c @@ -42,21 +42,22 @@ VLOG_DEFINE_THIS_MODULE(ovs_numa); * This module stores the affinity information of numa nodes and cpu cores. * It also provides functions to bookkeep the pin of threads on cpu cores. * - * It is assumed that the numa node ids and cpu core ids all start from 0 and - * range continuously. So, for example, if 'ovs_numa_get_n_cores()' returns N, - * user can assume core ids from 0 to N-1 are all valid and there is a - * 'struct cpu_core' for each id. + * It is assumed that the numa node ids and cpu core ids all start from 0. + * There is no guarantee that node and cpu ids are numbered consecutively + * So, for example, if two nodes exist with ids 0 and 8, + * 'ovs_numa_get_n_nodes()' will return 2, no assumption of node numbering + * should be made. * * NOTE, this module should only be used by the main thread. * - * NOTE, the assumption above will fail when cpu hotplug is used. In that - * case ovs-numa will not function correctly. For now, add a TODO entry - * for addressing it in the future. + * NOTE, if cpu hotplug is used 'all_numa_nodes' and 'all_cpu_cores' must be + * invalidated when ever the system topology changes. Support for detecting + * topology changes has not been included. For now, add a TODO entry for + * addressing it in the future. * * TODO: Fix ovs-numa when cpu hotplug is used. */ -#define MAX_NUMA_NODES 128 /* numa node. */ struct numa_node { @@ -130,15 +131,14 @@ insert_new_cpu_core(struct numa_node *n, unsigned core_id) * - "0,0,0,0": four cores on numa socket 0. * - "0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1": 16 cores on two numa sockets. * - "0,0,0,0,1,1,1,1": 8 cores on two numa sockets. - * - * The different numa ids must be consecutives or the function will abort. */ + * - "0,0,0,0,8,8,8,8": 8 cores on two numa sockets, non-contiguous. + */ static void discover_numa_and_core_dummy(void) { char *conf = xstrdup(dummy_config); char *id, *saveptr = NULL; unsigned i = 0; - long max_numa_id = 0; for (id = strtok_r(conf, ",", &saveptr); id; id = strtok_r(NULL, ",", &saveptr)) { @@ -152,8 +152,6 @@ discover_numa_and_core_dummy(void) continue; } - max_numa_id = MAX(max_numa_id, numa_id); - hnode = hmap_first_with_hash(&all_numa_nodes, hash_int(numa_id, 0)); if (hnode) { @@ -169,10 +167,27 @@ discover_numa_and_core_dummy(void) free(conf); - if (max_numa_id + 1 != hmap_count(&all_numa_nodes)) { - ovs_fatal(0, "dummy numa contains non consecutive numa ids"); +} + +#ifdef __linux__ +/* Check if a CPU is detected and online. */ +static int +cpu_detected(unsigned int core_id) +{ + char path[PATH_MAX]; + int len = snprintf(path, sizeof(path), + "/sys/devices/system/cpu/cpu%d/topology/core_id", + core_id); + if (len <= 0 || (unsigned) len >= sizeof(path)) { + return 0; + } + if (access(path, F_OK) != 0) { + return 0; } + + return 1; } +#endif /* __linux__ */ /* Discovers all numa nodes and the corresponding cpu cores. * Constructs the 'struct numa_node' and 'struct cpu_core'. */ @@ -219,7 +234,9 @@ discover_numa_and_core(void) unsigned core_id; core_id = strtoul(subdir->d_name + 3, NULL, 10); - insert_new_cpu_core(n, core_id); + if (cpu_detected(core_id)) { + insert_new_cpu_core(n, core_id); + } } } closedir(dir); @@ -229,7 +246,7 @@ discover_numa_and_core(void) } free(path); - if (!dir || !numa_supported) { + if (!numa_supported) { break; } } diff --git a/lib/ovs-numa.h b/lib/ovs-numa.h index 8f2ea3430..ecc251a7f 100644 --- a/lib/ovs-numa.h +++ b/lib/ovs-numa.h @@ -26,6 +26,8 @@ #define OVS_CORE_UNSPEC INT_MAX #define OVS_NUMA_UNSPEC INT_MAX +#define MAX_NUMA_NODES 128 + /* Dump of a list of 'struct ovs_numa_info'. */ struct ovs_numa_dump { struct hmap cores; diff --git a/tests/dpif-netdev.at b/tests/dpif-netdev.at index 16402ebae..53eee185a 100644 --- a/tests/dpif-netdev.at +++ b/tests/dpif-netdev.at @@ -98,7 +98,7 @@ m4_define([DPIF_NETDEV_DUMMY_IFACE], fail-mode=secure -- \ add-port br1 p2 -- set interface p2 type=$1 options:stream=unix:$OVS_RUNDIR/p0.sock ofport_request=2 -- \ add-port br1 p8 -- set interface p8 ofport_request=8 type=$1 --], [], [], - [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,1,1,1,1"], [])]) + [m4_if([$1], [dummy-pmd], [--dummy-numa="0,0,0,0,8,8,8,8"], [])]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-ofctl add-flow br0 action=normal]) diff --git a/tests/pmd.at b/tests/pmd.at index e0d54c1dd..9c5824c55 100644 --- a/tests/pmd.at +++ b/tests/pmd.at @@ -361,8 +361,8 @@ AT_SETUP([PMD - change numa node]) OVS_VSWITCHD_START( [add-port br0 p1 -- set Interface p1 type=dummy-pmd ofport_request=1 options:n_rxq=2 -- \ add-port br0 p2 -- set Interface p2 type=dummy-pmd ofport_request=2 options:n_rxq=2 -- \ - set Open_vSwitch . other_config:pmd-cpu-mask=3 -], [], [], [--dummy-numa 0,1]) + set Open_vSwitch . other_config:pmd-cpu-mask=7 +], [], [], [--dummy-numa 0,1,8]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-ofctl add-flow br0 action=controller]) @@ -432,6 +432,40 @@ NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_l icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc ]) +AT_CHECK([ovs-vsctl set Interface p1 options:numa_id=8]) + +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl +p1 0 8 2 +p1 1 8 2 +p2 0 1 1 +p2 1 1 1 +]) + +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p1 --qid 1 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2]) +OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) + +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=1 (via action) data_len=106 (unbuffered) +icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc +]) + +AT_CHECK([ovs-ofctl monitor br0 65534 invalid_ttl --detach --no-chdir --pidfile 2> ofctl_monitor.log]) + +AT_CHECK([ovs-appctl netdev-dummy/receive p2 --qid 0 'in_port(1),eth(src=50:54:00:00:00:09,dst=50:54:00:00:00:0a),eth_type(0x0800),ipv4(src=10.0.0.2,dst=10.0.0.1,proto=1,tos=0,ttl=64,frag=no),icmp(type=8,code=0)']) + +OVS_WAIT_UNTIL([test `wc -l < ofctl_monitor.log` -ge 2]) +OVS_WAIT_UNTIL([ovs-appctl -t ovs-ofctl exit]) + +AT_CHECK([cat ofctl_monitor.log], [0], [dnl +NXT_PACKET_IN2 (xid=0x0): cookie=0x0 total_len=106 in_port=2 (via action) data_len=106 (unbuffered) +icmp,vlan_tci=0x0000,dl_src=50:54:00:00:00:09,dl_dst=50:54:00:00:00:0a,nw_src=10.0.0.2,nw_dst=10.0.0.1,nw_tos=0,nw_ecn=0,nw_ttl=64,icmp_type=8,icmp_code=0 icmp_csum:13fc +]) + + OVS_VSWITCHD_STOP AT_CLEANUP @@ -584,7 +618,7 @@ AT_CLEANUP AT_SETUP([PMD - rxq affinity - NUMA]) OVS_VSWITCHD_START( - [], [], [], [--dummy-numa 0,0,0,1,1]) + [], [], [], [--dummy-numa 0,0,0,1,1,8,8]) AT_CHECK([ovs-appctl vlog/set dpif:dbg dpif_netdev:dbg]) AT_CHECK([ovs-ofctl add-flow br0 actions=controller]) @@ -601,21 +635,38 @@ p1 1 0 2 AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:4"]) -dnl We moved the queues to different numa node. Expecting threads on +dnl We moved the queues to different contiguous numa node. Expecting threads on dnl NUMA node 1 to be created. AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl p1 0 1 3 p1 1 1 4 ]) +AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:6"]) + +dnl We moved the queues to different non-contiguous numa node. Expecting threads on +dnl NUMA node 8 to be created. +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl +p1 0 8 5 +p1 1 8 6 +]) + AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:3,1:1"]) -dnl Queues splitted between NUMA nodes. +dnl Queues splitted between contiguous NUMA nodes. AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl p1 0 1 3 p1 1 0 1 ]) +AT_CHECK([ovs-vsctl set Interface p1 other_config:pmd-rxq-affinity="0:5,1:1"]) + +dnl Queues splitted between non-contiguous NUMA nodes. +AT_CHECK([ovs-appctl dpif-netdev/pmd-rxq-show | parse_pmd_rxq_show], [0], [dnl +p1 0 8 5 +p1 1 0 1 +]) + AT_CHECK([ovs-vsctl remove Interface p1 other_config pmd-rxq-affinity]) dnl We removed the rxq-affinity request. dpif-netdev should assign queues |