summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorVishal Deep Ajmera <vishal.deep.ajmera@ericsson.com>2020-05-22 10:50:05 +0200
committerIlya Maximets <i.maximets@ovn.org>2020-06-22 13:11:51 +0200
commit9df65060cf4c27553ee5e29f74ef6807dd5af992 (patch)
tree45cd3f9870c75d5330a9a73ceb7da87c8779e971 /lib
parent1fe42975639854bc6cf4784b2554b438301c0b92 (diff)
downloadopenvswitch-9df65060cf4c27553ee5e29f74ef6807dd5af992.tar.gz
userspace: Avoid dp_hash recirculation for balance-tcp bond mode.
Problem: In OVS, flows with output over a bond interface of type “balance-tcp” get translated by the ofproto layer into "HASH" and "RECIRC" datapath actions. After recirculation, the packet is forwarded to the bond member port based on 8-bits of the datapath hash value computed through dp_hash. This causes performance degradation in the following ways: 1. The recirculation of the packet implies another lookup of the packet’s flow key in the exact match cache (EMC) and potentially Megaflow classifier (DPCLS). This is the biggest cost factor. 2. The recirculated packets have a new “RSS” hash and compete with the original packets for the scarce number of EMC slots. This implies more EMC misses and potentially EMC thrashing causing costly DPCLS lookups. 3. The 256 extra megaflow entries per bond for dp_hash bond selection put additional load on the revalidation threads. Owing to this performance degradation, deployments stick to “balance-slb” bond mode even though it does not do active-active load balancing for VXLAN- and GRE-tunnelled traffic because all tunnel packets have the same source MAC address. Proposed optimization: This proposal introduces a new load-balancing output action instead of recirculation. Maintain one table per-bond (could just be an array of uint16's) and program it the same way internal flows are created today for each possible hash value (256 entries) from ofproto layer. Use this table to load-balance flows as part of output action processing. Currently xlate_normal() -> output_normal() -> bond_update_post_recirc_rules() -> bond_may_recirc() and compose_output_action__() generate 'dp_hash(hash_l4(0))' and 'recirc(<RecircID>)' actions. In this case the RecircID identifies the bond. For the recirculated packets the ofproto layer installs megaflow entries that match on RecircID and masked dp_hash and send them to the corresponding output port. 
Instead, we will now generate action as 'lb_output(<bond id>)' This combines hash computation (only if needed, else re-use RSS hash) and inline load-balancing over the bond. This action is used *only* for balance-tcp bonds in userspace datapath (the OVS kernel datapath remains unchanged). Example: Current scheme: With 8 UDP flows (with random UDP src port): flow-dump from pmd on cpu core: 2 recirc_id(0),in_port(7),<...> actions:hash(hash_l4(0)),recirc(0x1) recirc_id(0x1),dp_hash(0xf8e02b7e/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb236c260/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x7d89eb18/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0xa78d75df/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb58d846f/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0x24534406/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x3cf32550/0xff),<...> actions:1 New scheme: We can do with a single flow entry (for any number of new flows): in_port(7),<...> actions:lb_output(1) A new CLI has been added to dump datapath bond cache as given below. # ovs-appctl dpif-netdev/bond-show [dp] Bond cache: bond-id 1 : bucket 0 - slave 2 bucket 1 - slave 1 bucket 2 - slave 2 bucket 3 - slave 1 Co-authored-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Vishal Deep Ajmera <vishal.deep.ajmera@ericsson.com> Tested-by: Matteo Croce <mcroce@redhat.com> Tested-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Diffstat (limited to 'lib')
-rw-r--r--lib/dpif-netdev.c425
-rw-r--r--lib/dpif-netlink.c3
-rw-r--r--lib/dpif-provider.h12
-rw-r--r--lib/dpif.c39
-rw-r--r--lib/dpif.h12
-rw-r--r--lib/odp-execute.c2
-rw-r--r--lib/odp-util.c14
7 files changed, 463 insertions, 44 deletions
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index 51c888501..1086efd47 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -111,6 +111,7 @@ COVERAGE_DEFINE(datapath_drop_tunnel_push_error);
COVERAGE_DEFINE(datapath_drop_tunnel_pop_error);
COVERAGE_DEFINE(datapath_drop_recirc_error);
COVERAGE_DEFINE(datapath_drop_invalid_port);
+COVERAGE_DEFINE(datapath_drop_invalid_bond);
COVERAGE_DEFINE(datapath_drop_invalid_tnl_port);
COVERAGE_DEFINE(datapath_drop_rx_invalid_packet);
@@ -310,6 +311,7 @@ struct pmd_auto_lb {
*
* dp_netdev_mutex (global)
* port_mutex
+ * bond_mutex
* non_pmd_mutex
*/
struct dp_netdev {
@@ -377,6 +379,10 @@ struct dp_netdev {
struct conntrack *conntrack;
struct pmd_auto_lb pmd_alb;
+
+ /* Bonds. */
+ struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
+ struct cmap tx_bonds; /* Contains 'struct tx_bond'. */
};
static void meter_lock(const struct dp_netdev *dp, uint32_t meter_id)
@@ -608,6 +614,20 @@ struct tx_port {
struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
};
+/* Contained by struct tx_bond 'slave_buckets'.
+ * One entry per hash bucket; maps the bucket to a bond member port and
+ * accumulates per-PMD tx statistics for that bucket. */
+struct slave_entry {
+ odp_port_t slave_id; /* Datapath port number of the bond member. */
+ atomic_ullong n_packets; /* Packets sent through this bucket. */
+ atomic_ullong n_bytes; /* Bytes sent through this bucket. */
+};
+
+/* Contained by struct dp_netdev_pmd_thread's 'tx_bonds'.
+ * Per-bond load-balancing table consulted by the lb_output action. */
+struct tx_bond {
+ struct cmap_node node; /* In 'tx_bonds' cmap, hashed by 'bond_id'. */
+ uint32_t bond_id; /* Datapath bond identifier. */
+ struct slave_entry slave_buckets[BOND_BUCKETS];
+};
+
+
/* A set of properties for the current processing loop that is not directly
* associated with the pmd thread itself, but with the packets being
* processed or the short-term system configuration (for example, time).
@@ -740,6 +760,11 @@ struct dp_netdev_pmd_thread {
* read by the pmd thread. */
struct hmap tx_ports OVS_GUARDED;
+ struct ovs_mutex bond_mutex; /* Protects updates of 'tx_bonds'. */
+ /* Map of 'tx_bond's used for transmission. Written by the main thread
+ * and read by the pmd thread. */
+ struct cmap tx_bonds;
+
/* These are thread-local copies of 'tx_ports'. One contains only tunnel
* ports (that support push_tunnel/pop_tunnel), the other contains ports
* with at least one txq (that support send). A port can be in both.
@@ -831,6 +856,12 @@ static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
static int
dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
bool force);
+static void dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
+ struct tx_bond *bond, bool update)
+ OVS_EXCLUDED(pmd->bond_mutex);
+static void dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
+ uint32_t bond_id)
+ OVS_EXCLUDED(pmd->bond_mutex);
static void reconfigure_datapath(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex);
@@ -1397,6 +1428,49 @@ pmd_perf_show_cmd(struct unixctl_conn *conn, int argc,
par.command_type = PMD_INFO_PERF_SHOW;
dpif_netdev_pmd_info(conn, argc, argv, &par);
}
+
+/* unixctl handler for "dpif-netdev/bond-show [dp]".  Prints, for the named
+ * datapath (or the only one, if unambiguous), each bond's bucket-to-member
+ * mapping.  Holds 'dp_netdev_mutex' across the dump so 'dp' cannot go away. */
+static void
+dpif_netdev_bond_show(struct unixctl_conn *conn, int argc,
+ const char *argv[], void *aux OVS_UNUSED)
+{
+ struct ds reply = DS_EMPTY_INITIALIZER;
+ struct dp_netdev *dp = NULL;
+
+ ovs_mutex_lock(&dp_netdev_mutex);
+ if (argc == 2) {
+ dp = shash_find_data(&dp_netdevs, argv[1]);
+ } else if (shash_count(&dp_netdevs) == 1) {
+ /* There's only one datapath. */
+ dp = shash_first(&dp_netdevs)->data;
+ }
+ if (!dp) {
+ /* Unknown name, or several datapaths and none specified. */
+ ovs_mutex_unlock(&dp_netdev_mutex);
+ unixctl_command_reply_error(conn,
+ "please specify an existing datapath");
+ return;
+ }
+
+ if (cmap_count(&dp->tx_bonds) > 0) {
+ struct tx_bond *dp_bond_entry;
+ uint32_t slave_id;
+
+ ds_put_cstr(&reply, "Bonds:\n");
+ CMAP_FOR_EACH (dp_bond_entry, node, &dp->tx_bonds) {
+ ds_put_format(&reply, " bond-id %"PRIu32":\n",
+ dp_bond_entry->bond_id);
+ for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
+ slave_id =
+ odp_to_u32(dp_bond_entry->slave_buckets[bucket].slave_id);
+ ds_put_format(&reply, " bucket %d - slave %"PRIu32"\n",
+ bucket, slave_id);
+ }
+ }
+ }
+ ovs_mutex_unlock(&dp_netdev_mutex);
+ unixctl_command_reply(conn, ds_cstr(&reply));
+ ds_destroy(&reply);
+}
+
+
static int
dpif_netdev_init(void)
@@ -1428,6 +1502,9 @@ dpif_netdev_init(void)
"[-us usec] [-q qlen]",
0, 10, pmd_perf_log_set_cmd,
NULL);
+ unixctl_command_register("dpif-netdev/bond-show", "[dp]",
+ 0, 1, dpif_netdev_bond_show,
+ NULL);
return 0;
}
@@ -1552,6 +1629,9 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
ovs_mutex_init_recursive(&dp->port_mutex);
hmap_init(&dp->ports);
dp->port_seq = seq_create();
+ ovs_mutex_init(&dp->bond_mutex);
+ cmap_init(&dp->tx_bonds);
+
fat_rwlock_init(&dp->upcall_rwlock);
dp->reconfigure_seq = seq_create();
@@ -1658,6 +1738,12 @@ dp_delete_meter(struct dp_netdev *dp, uint32_t meter_id)
}
}
+/* Hash used for 'tx_bonds' cmap lookups; must be used consistently for
+ * insert, replace, remove and lookup of 'struct tx_bond' nodes. */
+static uint32_t
+hash_bond_id(uint32_t bond_id)
+{
+ return hash_int(bond_id, 0);
+}
+
+
/* Requires dp_netdev_mutex so that we can't get a new reference to 'dp'
* through the 'dp_netdevs' shash while freeing 'dp'. */
static void
@@ -1665,6 +1751,7 @@ dp_netdev_free(struct dp_netdev *dp)
OVS_REQUIRES(dp_netdev_mutex)
{
struct dp_netdev_port *port, *next;
+ struct tx_bond *bond;
shash_find_and_delete(&dp_netdevs, dp->name);
@@ -1674,6 +1761,13 @@ dp_netdev_free(struct dp_netdev *dp)
}
ovs_mutex_unlock(&dp->port_mutex);
+ ovs_mutex_lock(&dp->bond_mutex);
+ CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
+ cmap_remove(&dp->tx_bonds, &bond->node, hash_bond_id(bond->bond_id));
+ ovsrcu_postpone(free, bond);
+ }
+ ovs_mutex_unlock(&dp->bond_mutex);
+
dp_netdev_destroy_all_pmds(dp, true);
cmap_destroy(&dp->poll_threads);
@@ -1692,6 +1786,9 @@ dp_netdev_free(struct dp_netdev *dp)
hmap_destroy(&dp->ports);
ovs_mutex_destroy(&dp->port_mutex);
+ cmap_destroy(&dp->tx_bonds);
+ ovs_mutex_destroy(&dp->bond_mutex);
+
/* Upcalls must be disabled at this point */
dp_netdev_destroy_upcall_lock(dp);
@@ -4423,6 +4520,20 @@ tx_port_lookup(const struct hmap *hmap, odp_port_t port_no)
return NULL;
}
+/* Returns the 'struct tx_bond' with the given 'bond_id' in 'tx_bonds', or
+ * NULL if there is none.  Safe to call without a lock: cmap iteration is
+ * RCU-protected, so PMD threads use this on the fast path. */
+static struct tx_bond *
+tx_bond_lookup(const struct cmap *tx_bonds, uint32_t bond_id)
+{
+ uint32_t hash = hash_bond_id(bond_id);
+ struct tx_bond *tx;
+
+ CMAP_FOR_EACH_WITH_HASH (tx, node, hash, tx_bonds) {
+ if (tx->bond_id == bond_id) {
+ return tx;
+ }
+ }
+ return NULL;
+}
+
+
static int
port_reconfigure(struct dp_netdev_port *port)
{
@@ -5070,14 +5181,22 @@ reconfigure_datapath(struct dp_netdev *dp)
}
}
- /* Add every port to the tx cache of every pmd thread, if it's not
- * there already and if this pmd has at least one rxq to poll. */
+ /* Add every port and bond to the tx port and bond caches of
+ * every pmd thread, if it's not there already and if this pmd
+ * has at least one rxq to poll.
+ */
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
ovs_mutex_lock(&pmd->port_mutex);
if (hmap_count(&pmd->poll_list) || pmd->core_id == NON_PMD_CORE_ID) {
+ struct tx_bond *bond;
+
HMAP_FOR_EACH (port, node, &dp->ports) {
dp_netdev_add_port_tx_to_pmd(pmd, port);
}
+
+ CMAP_FOR_EACH (bond, node, &dp->tx_bonds) {
+ dp_netdev_add_bond_tx_to_pmd(pmd, bond, false);
+ }
}
ovs_mutex_unlock(&pmd->port_mutex);
}
@@ -6125,6 +6244,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
atomic_init(&pmd->reload, false);
ovs_mutex_init(&pmd->flow_mutex);
ovs_mutex_init(&pmd->port_mutex);
+ ovs_mutex_init(&pmd->bond_mutex);
cmap_init(&pmd->flow_table);
cmap_init(&pmd->classifiers);
pmd->ctx.last_rxq = NULL;
@@ -6135,6 +6255,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
hmap_init(&pmd->tx_ports);
hmap_init(&pmd->tnl_port_cache);
hmap_init(&pmd->send_port_cache);
+ cmap_init(&pmd->tx_bonds);
/* init the 'flow_cache' since there is no
* actual thread created for NON_PMD_CORE_ID. */
if (core_id == NON_PMD_CORE_ID) {
@@ -6155,6 +6276,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
hmap_destroy(&pmd->send_port_cache);
hmap_destroy(&pmd->tnl_port_cache);
hmap_destroy(&pmd->tx_ports);
+ cmap_destroy(&pmd->tx_bonds);
hmap_destroy(&pmd->poll_list);
/* All flows (including their dpcls_rules) have been deleted already */
CMAP_FOR_EACH (cls, node, &pmd->classifiers) {
@@ -6166,6 +6288,7 @@ dp_netdev_destroy_pmd(struct dp_netdev_pmd_thread *pmd)
ovs_mutex_destroy(&pmd->flow_mutex);
seq_destroy(pmd->reload_seq);
ovs_mutex_destroy(&pmd->port_mutex);
+ ovs_mutex_destroy(&pmd->bond_mutex);
free(pmd);
}
@@ -6235,6 +6358,7 @@ dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
{
struct rxq_poll *poll;
struct tx_port *port;
+ struct tx_bond *tx;
ovs_mutex_lock(&pmd->port_mutex);
HMAP_FOR_EACH_POP (poll, node, &pmd->poll_list) {
@@ -6244,6 +6368,13 @@ dp_netdev_pmd_clear_ports(struct dp_netdev_pmd_thread *pmd)
free(port);
}
ovs_mutex_unlock(&pmd->port_mutex);
+
+ ovs_mutex_lock(&pmd->bond_mutex);
+ CMAP_FOR_EACH (tx, node, &pmd->tx_bonds) {
+ cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
+ ovsrcu_postpone(free, tx);
+ }
+ ovs_mutex_unlock(&pmd->bond_mutex);
}
/* Adds rx queue to poll_list of PMD thread, if it's not there already. */
@@ -6319,6 +6450,62 @@ dp_netdev_del_port_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
free(tx);
pmd->need_reload = true;
}
+
+/* Add bond to the tx bond cmap of 'pmd'.  If 'update' is true and the bond
+ * already exists, its bucket mapping is replaced with 'bond' while the
+ * per-bucket statistics of the old entry are carried over; the old entry is
+ * freed via RCU so concurrent readers remain safe. */
+static void
+dp_netdev_add_bond_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
+ struct tx_bond *bond, bool update)
+ OVS_EXCLUDED(pmd->bond_mutex)
+{
+ struct tx_bond *tx;
+
+ ovs_mutex_lock(&pmd->bond_mutex);
+ tx = tx_bond_lookup(&pmd->tx_bonds, bond->bond_id);
+
+ if (tx && !update) {
+ /* It's not an update and the entry already exists. Do nothing. */
+ goto unlock;
+ }
+
+ if (tx) {
+ struct tx_bond *new_tx = xmemdup(bond, sizeof *bond);
+
+ /* Copy the stats for each bucket. */
+ for (int i = 0; i < BOND_BUCKETS; i++) {
+ uint64_t n_packets, n_bytes;
+
+ atomic_read_relaxed(&tx->slave_buckets[i].n_packets, &n_packets);
+ atomic_read_relaxed(&tx->slave_buckets[i].n_bytes, &n_bytes);
+ atomic_init(&new_tx->slave_buckets[i].n_packets, n_packets);
+ atomic_init(&new_tx->slave_buckets[i].n_bytes, n_bytes);
+ }
+ /* Replace atomically for readers, defer the free past the grace
+ * period in case a PMD is still using the old entry. */
+ cmap_replace(&pmd->tx_bonds, &tx->node, &new_tx->node,
+ hash_bond_id(bond->bond_id));
+ ovsrcu_postpone(free, tx);
+ } else {
+ tx = xmemdup(bond, sizeof *bond);
+ cmap_insert(&pmd->tx_bonds, &tx->node, hash_bond_id(bond->bond_id));
+ }
+unlock:
+ ovs_mutex_unlock(&pmd->bond_mutex);
+}
+
+
+/* Delete bond from the tx bond cmap of 'pmd'.  No-op if 'bond_id' is not
+ * present.  The removed entry is freed via RCU, since a PMD fast path may
+ * still hold a pointer obtained from a lockless lookup. */
+static void
+dp_netdev_del_bond_tx_from_pmd(struct dp_netdev_pmd_thread *pmd,
+ uint32_t bond_id)
+ OVS_EXCLUDED(pmd->bond_mutex)
+{
+ struct tx_bond *tx;
+
+ ovs_mutex_lock(&pmd->bond_mutex);
+ tx = tx_bond_lookup(&pmd->tx_bonds, bond_id);
+ if (tx) {
+ cmap_remove(&pmd->tx_bonds, &tx->node, hash_bond_id(tx->bond_id));
+ ovsrcu_postpone(free, tx);
+ }
+ ovs_mutex_unlock(&pmd->bond_mutex);
+}
static char *
dpif_netdev_get_datapath_version(void)
@@ -7144,6 +7331,96 @@ dp_execute_userspace_action(struct dp_netdev_pmd_thread *pmd,
}
}
+/* Queues 'packets_' for transmission on datapath port 'port_no', batching
+ * them on the port's pending output batch (flushed later or when full).
+ * If 'should_steal' is false the packets are cloned first, so the caller
+ * keeps ownership of the originals.  Returns true on success, false if
+ * 'port_no' is unknown (in which case the batch is dropped and counted). */
+static bool
+dp_execute_output_action(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets_,
+ bool should_steal, odp_port_t port_no)
+{
+ struct tx_port *p = pmd_send_port_cache_lookup(pmd, port_no);
+ struct dp_packet_batch out;
+
+ if (!OVS_LIKELY(p)) {
+ COVERAGE_ADD(datapath_drop_invalid_port,
+ dp_packet_batch_size(packets_));
+ dp_packet_delete_batch(packets_, should_steal);
+ return false;
+ }
+ if (!should_steal) {
+ dp_packet_batch_clone(&out, packets_);
+ dp_packet_batch_reset_cutlen(packets_);
+ packets_ = &out;
+ }
+ dp_packet_batch_apply_cutlen(packets_);
+#ifdef DPDK_NETDEV
+ if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
+ && packets_->packets[0]->source
+ != p->output_pkts.packets[0]->source)) {
+ /* XXX: netdev-dpdk assumes that all packets in a single
+ * output batch has the same source. Flush here to
+ * avoid memory access issues. */
+ dp_netdev_pmd_flush_output_on_port(pmd, p);
+ }
+#endif
+ if (dp_packet_batch_size(&p->output_pkts)
+ + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
+ /* Flush here to avoid overflow. */
+ dp_netdev_pmd_flush_output_on_port(pmd, p);
+ }
+ if (dp_packet_batch_is_empty(&p->output_pkts)) {
+ /* First packets queued on this port since the last flush. */
+ pmd->n_output_batches++;
+ }
+
+ struct dp_packet *packet;
+ DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
+ /* Remember the rx queue each packet came from for later accounting. */
+ p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
+ pmd->ctx.last_rxq;
+ dp_packet_batch_add(&p->output_pkts, packet);
+ }
+ return true;
+}
+
+
+/* Implements the 'lb_output(bond)' datapath action: distributes 'packets_'
+ * over the members of bond 'bond' using each packet's RSS hash, without
+ * recirculation.  Unknown bond ids drop the whole batch. */
+static void
+dp_execute_lb_output_action(struct dp_netdev_pmd_thread *pmd,
+ struct dp_packet_batch *packets_,
+ bool should_steal, uint32_t bond)
+{
+ struct tx_bond *p_bond = tx_bond_lookup(&pmd->tx_bonds, bond);
+ struct dp_packet_batch out;
+ struct dp_packet *packet;
+
+ if (!p_bond) {
+ COVERAGE_ADD(datapath_drop_invalid_bond,
+ dp_packet_batch_size(packets_));
+ dp_packet_delete_batch(packets_, should_steal);
+ return;
+ }
+ if (!should_steal) {
+ /* Work on clones so the caller keeps the original packets. */
+ dp_packet_batch_clone(&out, packets_);
+ dp_packet_batch_reset_cutlen(packets_);
+ packets_ = &out;
+ }
+ dp_packet_batch_apply_cutlen(packets_);
+
+ DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
+ /*
+ * Lookup the bond-hash table using hash to get the slave.
+ */
+ uint32_t hash = dp_packet_get_rss_hash(packet);
+ struct slave_entry *s_entry = &p_bond->slave_buckets[hash & BOND_MASK];
+ odp_port_t bond_member = s_entry->slave_id;
+ uint32_t size = dp_packet_size(packet);
+ struct dp_packet_batch output_pkt;
+
+ /* Send each packet as a single-packet batch; ownership is always
+ * stolen here because clones were made above when needed. */
+ dp_packet_batch_init_packet(&output_pkt, packet);
+ if (OVS_LIKELY(dp_execute_output_action(pmd, &output_pkt, true,
+ bond_member))) {
+ /* Update slave stats.  Non-atomic add is fine: this PMD thread
+ * is the only writer of its own bucket counters. */
+ non_atomic_ullong_add(&s_entry->n_packets, 1);
+ non_atomic_ullong_add(&s_entry->n_bytes, size);
+ }
+ }
+}
+
+
static void
dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
const struct nlattr *a, bool should_steal)
@@ -7159,49 +7436,14 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
switch ((enum ovs_action_attr)type) {
case OVS_ACTION_ATTR_OUTPUT:
- p = pmd_send_port_cache_lookup(pmd, nl_attr_get_odp_port(a));
- if (OVS_LIKELY(p)) {
- struct dp_packet *packet;
- struct dp_packet_batch out;
-
- if (!should_steal) {
- dp_packet_batch_clone(&out, packets_);
- dp_packet_batch_reset_cutlen(packets_);
- packets_ = &out;
- }
- dp_packet_batch_apply_cutlen(packets_);
-
-#ifdef DPDK_NETDEV
- if (OVS_UNLIKELY(!dp_packet_batch_is_empty(&p->output_pkts)
- && packets_->packets[0]->source
- != p->output_pkts.packets[0]->source)) {
- /* XXX: netdev-dpdk assumes that all packets in a single
- * output batch has the same source. Flush here to
- * avoid memory access issues. */
- dp_netdev_pmd_flush_output_on_port(pmd, p);
- }
-#endif
- if (dp_packet_batch_size(&p->output_pkts)
- + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
- /* Flush here to avoid overflow. */
- dp_netdev_pmd_flush_output_on_port(pmd, p);
- }
-
- if (dp_packet_batch_is_empty(&p->output_pkts)) {
- pmd->n_output_batches++;
- }
+ dp_execute_output_action(pmd, packets_, should_steal,
+ nl_attr_get_odp_port(a));
+ return;
- DP_PACKET_BATCH_FOR_EACH (i, packet, packets_) {
- p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
- pmd->ctx.last_rxq;
- dp_packet_batch_add(&p->output_pkts, packet);
- }
- return;
- } else {
- COVERAGE_ADD(datapath_drop_invalid_port,
- dp_packet_batch_size(packets_));
- }
- break;
+ case OVS_ACTION_ATTR_LB_OUTPUT:
+ dp_execute_lb_output_action(pmd, packets_, should_steal,
+ nl_attr_get_u32(a));
+ return;
case OVS_ACTION_ATTR_TUNNEL_PUSH:
if (should_steal) {
@@ -7813,6 +8055,98 @@ dpif_netdev_ipf_dump_done(struct dpif *dpif OVS_UNUSED, void *ipf_dump_ctx)
}
+/* dpif 'bond_add' implementation: installs (or atomically replaces) the
+ * bucket-to-member mapping 'slave_map' for 'bond_id' in the datapath and
+ * propagates it to every PMD thread.  Always returns 0. */
+static int
+dpif_netdev_bond_add(struct dpif *dpif, uint32_t bond_id,
+ odp_port_t *slave_map)
+{
+ struct tx_bond *new_tx = xzalloc(sizeof *new_tx);
+ struct dp_netdev *dp = get_dp_netdev(dpif);
+ struct dp_netdev_pmd_thread *pmd;
+
+ /* Prepare new bond mapping. */
+ new_tx->bond_id = bond_id;
+ for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
+ new_tx->slave_buckets[bucket].slave_id = slave_map[bucket];
+ }
+
+ ovs_mutex_lock(&dp->bond_mutex);
+ /* Check if bond already existed. */
+ struct tx_bond *old_tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
+ if (old_tx) {
+ /* RCU-defer the free; readers may still hold the old entry. */
+ cmap_replace(&dp->tx_bonds, &old_tx->node, &new_tx->node,
+ hash_bond_id(bond_id));
+ ovsrcu_postpone(free, old_tx);
+ } else {
+ cmap_insert(&dp->tx_bonds, &new_tx->node, hash_bond_id(bond_id));
+ }
+ ovs_mutex_unlock(&dp->bond_mutex);
+
+ /* Update all PMDs with new bond mapping. */
+ CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ dp_netdev_add_bond_tx_to_pmd(pmd, new_tx, true);
+ }
+ return 0;
+}
+
+
+/* dpif 'bond_del' implementation: removes bond 'bond_id' from the datapath
+ * and from every PMD thread.  Returns ENOENT if the bond does not exist. */
+static int
+dpif_netdev_bond_del(struct dpif *dpif, uint32_t bond_id)
+{
+ struct dp_netdev *dp = get_dp_netdev(dpif);
+ struct dp_netdev_pmd_thread *pmd;
+ struct tx_bond *tx;
+
+ ovs_mutex_lock(&dp->bond_mutex);
+ /* Check if bond existed. */
+ tx = tx_bond_lookup(&dp->tx_bonds, bond_id);
+ if (tx) {
+ cmap_remove(&dp->tx_bonds, &tx->node, hash_bond_id(bond_id));
+ ovsrcu_postpone(free, tx);
+ } else {
+ /* Bond is not present. */
+ ovs_mutex_unlock(&dp->bond_mutex);
+ return ENOENT;
+ }
+ ovs_mutex_unlock(&dp->bond_mutex);
+
+ /* Remove the bond map in all pmds. */
+ CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ dp_netdev_del_bond_tx_from_pmd(pmd, bond_id);
+ }
+ return 0;
+}
+
+
+/* dpif 'bond_stats_get' implementation: accumulates per-bucket byte counts
+ * for 'bond_id' across all PMD threads into 'n_bytes' (BOND_BUCKETS
+ * elements).  NOTE: entries are added to, not assigned; the dpif layer
+ * (dpif_bond_stats_get()) zeroes the array first.  Returns ENOENT if the
+ * bond is unknown. */
+static int
+dpif_netdev_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
+ uint64_t *n_bytes)
+{
+ struct dp_netdev *dp = get_dp_netdev(dpif);
+ struct dp_netdev_pmd_thread *pmd;
+
+ if (!tx_bond_lookup(&dp->tx_bonds, bond_id)) {
+ return ENOENT;
+ }
+
+ /* Search the bond in all PMDs. */
+ CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
+ struct tx_bond *pmd_bond_entry
+ = tx_bond_lookup(&pmd->tx_bonds, bond_id);
+
+ if (!pmd_bond_entry) {
+ continue;
+ }
+
+ /* Read bond stats. */
+ for (int i = 0; i < BOND_BUCKETS; i++) {
+ uint64_t pmd_n_bytes;
+
+ atomic_read_relaxed(&pmd_bond_entry->slave_buckets[i].n_bytes,
+ &pmd_n_bytes);
+ n_bytes[i] += pmd_n_bytes;
+ }
+ }
+ return 0;
+}
+
+
const struct dpif_class dpif_netdev_class = {
"netdev",
true, /* cleanup_required */
@@ -7886,6 +8220,9 @@ const struct dpif_class dpif_netdev_class = {
dpif_netdev_meter_set,
dpif_netdev_meter_get,
dpif_netdev_meter_del,
+ dpif_netdev_bond_add,
+ dpif_netdev_bond_del,
+ dpif_netdev_bond_stats_get,
};
static void
diff --git a/lib/dpif-netlink.c b/lib/dpif-netlink.c
index 1817e9f84..18322e879 100644
--- a/lib/dpif-netlink.c
+++ b/lib/dpif-netlink.c
@@ -4051,6 +4051,9 @@ const struct dpif_class dpif_netlink_class = {
dpif_netlink_meter_set,
dpif_netlink_meter_get,
dpif_netlink_meter_del,
+ NULL, /* bond_add */
+ NULL, /* bond_del */
+ NULL, /* bond_stats_get */
};
static int
diff --git a/lib/dpif-provider.h b/lib/dpif-provider.h
index b77317bca..0e024c1c9 100644
--- a/lib/dpif-provider.h
+++ b/lib/dpif-provider.h
@@ -616,6 +616,18 @@ struct dpif_class {
* zero. */
int (*meter_del)(struct dpif *, ofproto_meter_id meter_id,
struct ofputil_meter_stats *, uint16_t n_bands);
+
+ /* Adds a bond with 'bond_id' and the slave-map to 'dpif'. */
+ int (*bond_add)(struct dpif *dpif, uint32_t bond_id,
+ odp_port_t *slave_map);
+
+ /* Removes bond identified by 'bond_id' from 'dpif'. */
+ int (*bond_del)(struct dpif *dpif, uint32_t bond_id);
+
+ /* Reads bond stats from 'dpif'. 'n_bytes' should be an array with size
+ * sufficient to store BOND_BUCKETS number of elements. */
+ int (*bond_stats_get)(struct dpif *dpif, uint32_t bond_id,
+ uint64_t *n_bytes);
};
extern const struct dpif_class dpif_netlink_class;
diff --git a/lib/dpif.c b/lib/dpif.c
index 9d9c716c1..c529a93f1 100644
--- a/lib/dpif.c
+++ b/lib/dpif.c
@@ -1170,6 +1170,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_,
case OVS_ACTION_ATTR_CT:
case OVS_ACTION_ATTR_OUTPUT:
+ case OVS_ACTION_ATTR_LB_OUTPUT:
case OVS_ACTION_ATTR_TUNNEL_PUSH:
case OVS_ACTION_ATTR_TUNNEL_POP:
case OVS_ACTION_ATTR_USERSPACE:
@@ -1220,6 +1221,7 @@ dpif_execute_helper_cb(void *aux_, struct dp_packet_batch *packets_,
struct dp_packet *clone = NULL;
uint32_t cutlen = dp_packet_get_cutlen(packet);
if (cutlen && (type == OVS_ACTION_ATTR_OUTPUT
+ || type == OVS_ACTION_ATTR_LB_OUTPUT
|| type == OVS_ACTION_ATTR_TUNNEL_PUSH
|| type == OVS_ACTION_ATTR_TUNNEL_POP
|| type == OVS_ACTION_ATTR_USERSPACE)) {
@@ -1879,6 +1881,16 @@ dpif_supports_explicit_drop_action(const struct dpif *dpif)
return dpif_is_netdev(dpif);
}
+/* Returns true if 'dpif' supports the 'lb_output' datapath action
+ * (direct bond load-balancing without recirculation). */
+bool
+dpif_supports_lb_output_action(const struct dpif *dpif)
+{
+ /*
+ * Balance-tcp optimization is currently supported in netdev
+ * datapath only.
+ */
+ return dpif_is_netdev(dpif);
+}
+
+
/* Meters */
void
dpif_meter_get_features(const struct dpif *dpif,
@@ -1976,3 +1988,30 @@ dpif_meter_del(struct dpif *dpif, ofproto_meter_id meter_id,
}
return error;
}
+
+/* Installs bond 'bond_id' with bucket-to-member mapping 'slave_map'
+ * (BOND_BUCKETS elements) into 'dpif'.  Returns 0 on success, EOPNOTSUPP
+ * if the datapath does not implement bonds, otherwise a positive errno. */
+int
+dpif_bond_add(struct dpif *dpif, uint32_t bond_id, odp_port_t *slave_map)
+{
+ /* Fix: test the 'bond_add' callback, not 'bond_del', before calling
+ * 'bond_add' (copy-paste slip; harmless only while every dpif_class
+ * defines both or neither). */
+ return dpif->dpif_class->bond_add
+ ? dpif->dpif_class->bond_add(dpif, bond_id, slave_map)
+ : EOPNOTSUPP;
+}
+
+
+/* Removes bond 'bond_id' from 'dpif'.  Returns 0 on success, EOPNOTSUPP
+ * if the datapath does not implement bonds, otherwise a positive errno
+ * (e.g. ENOENT if the bond does not exist). */
+int
+dpif_bond_del(struct dpif *dpif, uint32_t bond_id)
+{
+ return dpif->dpif_class->bond_del
+ ? dpif->dpif_class->bond_del(dpif, bond_id)
+ : EOPNOTSUPP;
+}
+
+
+/* Retrieves per-bucket byte counters for bond 'bond_id' into 'n_bytes',
+ * which must have room for BOND_BUCKETS elements.  The array is zeroed
+ * here so provider implementations may accumulate into it.  Returns 0 on
+ * success, EOPNOTSUPP if unsupported, otherwise a positive errno. */
+int
+dpif_bond_stats_get(struct dpif *dpif, uint32_t bond_id,
+ uint64_t *n_bytes)
+{
+ memset(n_bytes, 0, BOND_BUCKETS * sizeof *n_bytes);
+
+ return dpif->dpif_class->bond_stats_get
+ ? dpif->dpif_class->bond_stats_get(dpif, bond_id, n_bytes)
+ : EOPNOTSUPP;
+}
diff --git a/lib/dpif.h b/lib/dpif.h
index 4df8f7c8b..2d52f0186 100644
--- a/lib/dpif.h
+++ b/lib/dpif.h
@@ -891,6 +891,18 @@ int dpif_meter_get(const struct dpif *, ofproto_meter_id meter_id,
struct ofputil_meter_stats *, uint16_t n_bands);
int dpif_meter_del(struct dpif *, ofproto_meter_id meter_id,
struct ofputil_meter_stats *, uint16_t n_bands);
+
+/* Bonding. */
+
+/* Bit-mask for hashing a flow down to a bucket. */
+#define BOND_MASK 0xff
+#define BOND_BUCKETS (BOND_MASK + 1)
+
+int dpif_bond_add(struct dpif *, uint32_t bond_id, odp_port_t *slave_map);
+int dpif_bond_del(struct dpif *, uint32_t bond_id);
+int dpif_bond_stats_get(struct dpif *, uint32_t bond_id, uint64_t *n_bytes);
+bool dpif_supports_lb_output_action(const struct dpif *);
+
/* Miscellaneous. */
diff --git a/lib/odp-execute.c b/lib/odp-execute.c
index 42d3335f0..6018e378a 100644
--- a/lib/odp-execute.c
+++ b/lib/odp-execute.c
@@ -793,6 +793,7 @@ requires_datapath_assistance(const struct nlattr *a)
switch (type) {
/* These only make sense in the context of a datapath. */
case OVS_ACTION_ATTR_OUTPUT:
+ case OVS_ACTION_ATTR_LB_OUTPUT:
case OVS_ACTION_ATTR_TUNNEL_PUSH:
case OVS_ACTION_ATTR_TUNNEL_POP:
case OVS_ACTION_ATTR_USERSPACE:
@@ -1068,6 +1069,7 @@ odp_execute_actions(void *dp, struct dp_packet_batch *batch, bool steal,
return;
}
case OVS_ACTION_ATTR_OUTPUT:
+ case OVS_ACTION_ATTR_LB_OUTPUT:
case OVS_ACTION_ATTR_TUNNEL_PUSH:
case OVS_ACTION_ATTR_TUNNEL_POP:
case OVS_ACTION_ATTR_USERSPACE:
diff --git a/lib/odp-util.c b/lib/odp-util.c
index e907804aa..011db9ebb 100644
--- a/lib/odp-util.c
+++ b/lib/odp-util.c
@@ -119,6 +119,7 @@ odp_action_len(uint16_t type)
switch ((enum ovs_action_attr) type) {
case OVS_ACTION_ATTR_OUTPUT: return sizeof(uint32_t);
+ case OVS_ACTION_ATTR_LB_OUTPUT: return sizeof(uint32_t);
case OVS_ACTION_ATTR_TRUNC: return sizeof(struct ovs_action_trunc);
case OVS_ACTION_ATTR_TUNNEL_PUSH: return ATTR_LEN_VARIABLE;
case OVS_ACTION_ATTR_TUNNEL_POP: return sizeof(uint32_t);
@@ -1132,6 +1133,9 @@ format_odp_action(struct ds *ds, const struct nlattr *a,
case OVS_ACTION_ATTR_OUTPUT:
odp_portno_name_format(portno_names, nl_attr_get_odp_port(a), ds);
break;
+ case OVS_ACTION_ATTR_LB_OUTPUT:
+ ds_put_format(ds, "lb_output(%"PRIu32")", nl_attr_get_u32(a));
+ break;
case OVS_ACTION_ATTR_TRUNC: {
const struct ovs_action_trunc *trunc =
nl_attr_get_unspec(a, sizeof *trunc);
@@ -2306,6 +2310,16 @@ parse_odp_action__(struct parse_odp_context *context, const char *s,
}
{
+ uint32_t bond_id;
+ int n;
+
+ if (ovs_scan(s, "lb_output(%"PRIu32")%n", &bond_id, &n)) {
+ nl_msg_put_u32(actions, OVS_ACTION_ATTR_LB_OUTPUT, bond_id);
+ return n;
+ }
+ }
+
+ {
uint32_t max_len;
int n;