summaryrefslogtreecommitdiff
path: root/ofproto
diff options
context:
space:
mode:
authorVishal Deep Ajmera <vishal.deep.ajmera@ericsson.com>2020-05-22 10:50:05 +0200
committerIlya Maximets <i.maximets@ovn.org>2020-06-22 13:11:51 +0200
commit9df65060cf4c27553ee5e29f74ef6807dd5af992 (patch)
tree45cd3f9870c75d5330a9a73ceb7da87c8779e971 /ofproto
parent1fe42975639854bc6cf4784b2554b438301c0b92 (diff)
downloadopenvswitch-9df65060cf4c27553ee5e29f74ef6807dd5af992.tar.gz
userspace: Avoid dp_hash recirculation for balance-tcp bond mode.
Problem: In OVS, flows with output over a bond interface of type “balance-tcp” gets translated by the ofproto layer into "HASH" and "RECIRC" datapath actions. After recirculation, the packet is forwarded to the bond member port based on 8-bits of the datapath hash value computed through dp_hash. This causes performance degradation in the following ways: 1. The recirculation of the packet implies another lookup of the packet’s flow key in the exact match cache (EMC) and potentially Megaflow classifier (DPCLS). This is the biggest cost factor. 2. The recirculated packets have a new “RSS” hash and compete with the original packets for the scarce number of EMC slots. This implies more EMC misses and potentially EMC thrashing causing costly DPCLS lookups. 3. The 256 extra megaflow entries per bond for dp_hash bond selection put additional load on the revalidation threads. Owing to this performance degradation, deployments stick to “balance-slb” bond mode even though it does not do active-active load balancing for VXLAN- and GRE-tunnelled traffic because all tunnel packet have the same source MAC address. Proposed optimization: This proposal introduces a new load-balancing output action instead of recirculation. Maintain one table per-bond (could just be an array of uint16's) and program it the same way internal flows are created today for each possible hash value (256 entries) from ofproto layer. Use this table to load-balance flows as part of output action processing. Currently xlate_normal() -> output_normal() -> bond_update_post_recirc_rules() -> bond_may_recirc() and compose_output_action__() generate 'dp_hash(hash_l4(0))' and 'recirc(<RecircID>)' actions. In this case the RecircID identifies the bond. For the recirculated packets the ofproto layer installs megaflow entries that match on RecircID and masked dp_hash and send them to the corresponding output port. Instead, we will now generate action as 'lb_output(<bond id>)' This combines hash computation (only if needed, else re-use RSS hash) and inline load-balancing over the bond. This action is used *only* for balance-tcp bonds in userspace datapath (the OVS kernel datapath remains unchanged). Example: Current scheme: With 8 UDP flows (with random UDP src port): flow-dump from pmd on cpu core: 2 recirc_id(0),in_port(7),<...> actions:hash(hash_l4(0)),recirc(0x1) recirc_id(0x1),dp_hash(0xf8e02b7e/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb236c260/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x7d89eb18/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0xa78d75df/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0xb58d846f/0xff),<...> actions:2 recirc_id(0x1),dp_hash(0x24534406/0xff),<...> actions:1 recirc_id(0x1),dp_hash(0x3cf32550/0xff),<...> actions:1 New scheme: We can do with a single flow entry (for any number of new flows): in_port(7),<...> actions:lb_output(1) A new CLI has been added to dump datapath bond cache as given below. # ovs-appctl dpif-netdev/bond-show [dp] Bond cache: bond-id 1 : bucket 0 - slave 2 bucket 1 - slave 1 bucket 2 - slave 2 bucket 3 - slave 1 Co-authored-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Manohar Krishnappa Chidambaraswamy <manukc@gmail.com> Signed-off-by: Vishal Deep Ajmera <vishal.deep.ajmera@ericsson.com> Tested-by: Matteo Croce <mcroce@redhat.com> Tested-by: Adrian Moreno <amorenoz@redhat.com> Acked-by: Eelco Chaudron <echaudro@redhat.com> Signed-off-by: Ilya Maximets <i.maximets@ovn.org>
Diffstat (limited to 'ofproto')
-rw-r--r--ofproto/bond.c111
-rw-r--r--ofproto/bond.h5
-rw-r--r--ofproto/ofproto-dpif-ipfix.c1
-rw-r--r--ofproto/ofproto-dpif-sflow.c3
-rw-r--r--ofproto/ofproto-dpif-xlate.c15
-rw-r--r--ofproto/ofproto-dpif.c30
-rw-r--r--ofproto/ofproto-dpif.h10
7 files changed, 153 insertions, 22 deletions
diff --git a/ofproto/bond.c b/ofproto/bond.c
index 405202fb6..9947e7531 100644
--- a/ofproto/bond.c
+++ b/ofproto/bond.c
@@ -54,10 +54,6 @@ static struct ovs_rwlock rwlock = OVS_RWLOCK_INITIALIZER;
static struct hmap all_bonds__ = HMAP_INITIALIZER(&all_bonds__);
static struct hmap *const all_bonds OVS_GUARDED_BY(rwlock) = &all_bonds__;
-/* Bit-mask for hashing a flow down to a bucket. */
-#define BOND_MASK 0xff
-#define BOND_BUCKETS (BOND_MASK + 1)
-
/* Priority for internal rules created to handle recirculation */
#define RECIRC_RULE_PRIORITY 20
@@ -126,6 +122,8 @@ struct bond {
enum lacp_status lacp_status; /* Status of LACP negotiations. */
bool bond_revalidate; /* True if flows need revalidation. */
uint32_t basis; /* Basis for flow hash function. */
+ bool use_lb_output_action; /* Use lb_output action to avoid recirculation.
+ Applicable only for Balance TCP mode. */
/* SLB specific bonding info. */
struct bond_entry *hash; /* An array of BOND_BUCKETS elements. */
@@ -185,8 +183,10 @@ static struct bond_slave *choose_output_slave(const struct bond *,
struct flow_wildcards *,
uint16_t vlan)
OVS_REQ_RDLOCK(rwlock);
-static void update_recirc_rules__(struct bond *bond);
+static void update_recirc_rules__(struct bond *);
static bool bond_is_falling_back_to_ab(const struct bond *);
+static void bond_add_lb_output_buckets(const struct bond *);
+static void bond_del_lb_output_buckets(const struct bond *);
/* Attempts to parse 's' as the name of a bond balancing mode. If successful,
* stores the mode in '*balance' and returns true. Otherwise returns false
@@ -282,6 +282,10 @@ bond_unref(struct bond *bond)
/* Free bond resources. Remove existing post recirc rules. */
if (bond->recirc_id) {
+ if (bond_use_lb_output_action(bond)) {
+ /* Delete bond buckets from datapath if installed. */
+ bond_del_lb_output_buckets(bond);
+ }
recirc_free_id(bond->recirc_id);
bond->recirc_id = 0;
}
@@ -336,27 +340,35 @@ update_recirc_rules__(struct bond *bond)
struct ofpbuf ofpacts;
int i;
- ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
-
HMAP_FOR_EACH(pr_op, hmap_node, &bond->pr_rule_ops) {
pr_op->op = DEL;
}
if (bond->hash && bond->recirc_id) {
- for (i = 0; i < BOND_BUCKETS; i++) {
- struct bond_slave *slave = bond->hash[i].slave;
+ if (bond_use_lb_output_action(bond)) {
+ bond_add_lb_output_buckets(bond);
+ /* No need to install post recirculation rules as we are using
+ * lb_output action with bond buckets.
+ */
+ return;
+ } else {
+ for (i = 0; i < BOND_BUCKETS; i++) {
+ struct bond_slave *slave = bond->hash[i].slave;
- if (slave) {
- match_init_catchall(&match);
- match_set_recirc_id(&match, bond->recirc_id);
- match_set_dp_hash_masked(&match, i, BOND_MASK);
+ if (slave) {
+ match_init_catchall(&match);
+ match_set_recirc_id(&match, bond->recirc_id);
+ match_set_dp_hash_masked(&match, i, BOND_MASK);
- add_pr_rule(bond, &match, slave->ofp_port,
- &bond->hash[i].pr_rule);
+ add_pr_rule(bond, &match, slave->ofp_port,
+ &bond->hash[i].pr_rule);
+ }
}
}
}
+ ofpbuf_use_stub(&ofpacts, ofpacts_stub, sizeof ofpacts_stub);
+
HMAP_FOR_EACH_SAFE(pr_op, next_op, hmap_node, &bond->pr_rule_ops) {
int error;
switch (pr_op->op) {
@@ -464,9 +476,23 @@ bond_reconfigure(struct bond *bond, const struct bond_settings *s)
bond->recirc_id = recirc_alloc_id(bond->ofproto);
}
} else if (bond->recirc_id) {
+ if (bond_use_lb_output_action(bond)) {
+ /* Delete bond buckets from datapath if installed. */
+ bond_del_lb_output_buckets(bond);
+ }
recirc_free_id(bond->recirc_id);
bond->recirc_id = 0;
}
+ if (bond->use_lb_output_action != s->use_lb_output_action) {
+ if (s->use_lb_output_action &&
+ !ovs_lb_output_action_supported(bond->ofproto)) {
+ VLOG_WARN("%s: Datapath does not support 'lb_output' action, "
+ "disabled.", bond->name);
+ } else {
+ bond->use_lb_output_action = s->use_lb_output_action;
+ revalidate = true;
+ }
+ }
if (bond->balance == BM_AB || !bond->hash || revalidate) {
bond_entry_reset(bond);
@@ -944,19 +970,31 @@ bond_recirculation_account(struct bond *bond)
OVS_REQ_WRLOCK(rwlock)
{
int i;
+ uint64_t n_bytes[BOND_BUCKETS];
+ bool use_lb_output_action = bond_use_lb_output_action(bond);
+
+ if (use_lb_output_action) {
+ /* Retrieve bond stats from datapath. */
+ dpif_bond_stats_get(bond->ofproto->backer->dpif,
+ bond->recirc_id, n_bytes);
+ }
for (i=0; i<=BOND_MASK; i++) {
struct bond_entry *entry = &bond->hash[i];
struct rule *rule = entry->pr_rule;
+ struct pkt_stats stats;
- if (rule) {
- struct pkt_stats stats;
+ if (use_lb_output_action) {
+ stats.n_bytes = n_bytes[i];
+ } else if (rule) {
long long int used OVS_UNUSED;
rule->ofproto->ofproto_class->rule_get_stats(
rule, &stats, &used);
- bond_entry_account(entry, stats.n_bytes);
+ } else {
+ continue;
}
+ bond_entry_account(entry, stats.n_bytes);
}
}
@@ -1351,6 +1389,7 @@ bond_print_details(struct ds *ds, const struct bond *bond)
struct shash slave_shash = SHASH_INITIALIZER(&slave_shash);
const struct shash_node **sorted_slaves = NULL;
const struct bond_slave *slave;
+ bool use_lb_output_action;
bool may_recirc;
uint32_t recirc_id;
int i;
@@ -1366,6 +1405,11 @@ bond_print_details(struct ds *ds, const struct bond *bond)
ds_put_format(ds, "bond-hash-basis: %"PRIu32"\n", bond->basis);
+ use_lb_output_action = bond_use_lb_output_action(bond);
+ ds_put_format(ds, "lb_output action: %s, bond-id: %d\n",
+ use_lb_output_action ? "enabled" : "disabled",
+ use_lb_output_action ? recirc_id : -1);
+
ds_put_format(ds, "updelay: %d ms\n", bond->updelay);
ds_put_format(ds, "downdelay: %d ms\n", bond->downdelay);
@@ -1942,3 +1986,34 @@ bond_get_changed_active_slave(const char *name, struct eth_addr *mac,
return false;
}
+
+bool
+bond_use_lb_output_action(const struct bond *bond)
+{
+ return bond_may_recirc(bond) && bond->use_lb_output_action;
+}
+
+static void
+bond_add_lb_output_buckets(const struct bond *bond)
+{
+ ofp_port_t slave_map[BOND_BUCKETS];
+
+ for (int i = 0; i < BOND_BUCKETS; i++) {
+ struct bond_slave *slave = bond->hash[i].slave;
+
+ if (slave) {
+ slave_map[i] = slave->ofp_port;
+ } else {
+ slave_map[i] = OFPP_NONE;
+ }
+ }
+ ofproto_dpif_add_lb_output_buckets(bond->ofproto, bond->recirc_id,
+ slave_map);
+}
+
+static void
+bond_del_lb_output_buckets(const struct bond *bond)
+{
+ ofproto_dpif_delete_lb_output_buckets(bond->ofproto,
+ bond->recirc_id);
+}
diff --git a/ofproto/bond.h b/ofproto/bond.h
index e7c3d9bc3..40c3258dc 100644
--- a/ofproto/bond.h
+++ b/ofproto/bond.h
@@ -58,6 +58,8 @@ struct bond_settings {
/* The MAC address of the interface
that was active during the last
ovs run. */
+ bool use_lb_output_action; /* Use lb_output action. Only applicable for
+ bond mode BALANCE TCP. */
};
/* Program startup. */
@@ -122,4 +124,7 @@ void bond_rebalance(struct bond *);
*/
void bond_update_post_recirc_rules(struct bond *, uint32_t *recirc_id,
uint32_t *hash_basis);
+
+bool bond_use_lb_output_action(const struct bond *bond);
+
#endif /* bond.h */
diff --git a/ofproto/ofproto-dpif-ipfix.c b/ofproto/ofproto-dpif-ipfix.c
index b413768ef..796eb6f88 100644
--- a/ofproto/ofproto-dpif-ipfix.c
+++ b/ofproto/ofproto-dpif-ipfix.c
@@ -2979,6 +2979,7 @@ dpif_ipfix_read_actions(const struct flow *flow,
enum ovs_action_attr type = nl_attr_type(a);
switch (type) {
case OVS_ACTION_ATTR_OUTPUT:
+ case OVS_ACTION_ATTR_LB_OUTPUT:
ipfix_actions->output_action = true;
break;
case OVS_ACTION_ATTR_SAMPLE:
diff --git a/ofproto/ofproto-dpif-sflow.c b/ofproto/ofproto-dpif-sflow.c
index f9ea47a2f..f616fb2bb 100644
--- a/ofproto/ofproto-dpif-sflow.c
+++ b/ofproto/ofproto-dpif-sflow.c
@@ -1175,8 +1175,9 @@ dpif_sflow_read_actions(const struct flow *flow,
case OVS_ACTION_ATTR_RECIRC:
case OVS_ACTION_ATTR_HASH:
case OVS_ACTION_ATTR_CT:
- case OVS_ACTION_ATTR_CT_CLEAR:
+ case OVS_ACTION_ATTR_CT_CLEAR:
case OVS_ACTION_ATTR_METER:
+ case OVS_ACTION_ATTR_LB_OUTPUT:
break;
case OVS_ACTION_ATTR_SET_MASKED:
diff --git a/ofproto/ofproto-dpif-xlate.c b/ofproto/ofproto-dpif-xlate.c
index e64c6d477..e0ede2cab 100644
--- a/ofproto/ofproto-dpif-xlate.c
+++ b/ofproto/ofproto-dpif-xlate.c
@@ -4207,7 +4207,17 @@ compose_output_action__(struct xlate_ctx *ctx, ofp_port_t ofp_port,
/* Commit accumulated flow updates before output. */
xlate_commit_actions(ctx);
- if (xr) {
+ if (xr && bond_use_lb_output_action(xport->xbundle->bond)) {
+ /*
+ * If bond mode is balance-tcp and optimize balance tcp is enabled
+ * then use the hash directly for slave selection and avoid
+ * recirculation.
+ *
+ * Currently support for netdev datapath only.
+ */
+ nl_msg_put_u32(ctx->odp_actions, OVS_ACTION_ATTR_LB_OUTPUT,
+ xr->recirc_id);
+ } else if (xr) {
/* Recirculate the packet. */
struct ovs_action_hash *act_hash;
@@ -7310,7 +7320,8 @@ count_output_actions(const struct ofpbuf *odp_actions)
int n = 0;
NL_ATTR_FOR_EACH_UNSAFE (a, left, odp_actions->data, odp_actions->size) {
- if (a->nla_type == OVS_ACTION_ATTR_OUTPUT) {
+ if ((a->nla_type == OVS_ACTION_ATTR_OUTPUT) ||
+ (a->nla_type == OVS_ACTION_ATTR_LB_OUTPUT)) {
n++;
}
}
diff --git a/ofproto/ofproto-dpif.c b/ofproto/ofproto-dpif.c
index 7e10375f2..4f0638f23 100644
--- a/ofproto/ofproto-dpif.c
+++ b/ofproto/ofproto-dpif.c
@@ -868,6 +868,12 @@ ovs_explicit_drop_action_supported(struct ofproto_dpif *ofproto)
return ofproto->backer->rt_support.explicit_drop_action;
}
+bool
+ovs_lb_output_action_supported(struct ofproto_dpif *ofproto)
+{
+ return ofproto->backer->rt_support.lb_output_action;
+}
+
/* Tests whether 'backer''s datapath supports recirculation. Only newer
* datapaths support OVS_KEY_ATTR_RECIRC_ID in keys. We need to disable some
* features on older datapaths that don't support this feature.
@@ -1582,6 +1588,8 @@ check_support(struct dpif_backer *backer)
backer->rt_support.ct_timeout = check_ct_timeout_policy(backer);
backer->rt_support.explicit_drop_action =
dpif_supports_explicit_drop_action(backer->dpif);
+ backer->rt_support.lb_output_action=
+ dpif_supports_lb_output_action(backer->dpif);
/* Flow fields. */
backer->rt_support.odp.ct_state = check_ct_state(backer);
@@ -3441,6 +3449,27 @@ bundle_remove(struct ofport *port_)
}
}
+int
+ofproto_dpif_add_lb_output_buckets(struct ofproto_dpif *ofproto,
+ uint32_t bond_id,
+ const ofp_port_t *slave_map)
+{
+ odp_port_t odp_map[BOND_BUCKETS];
+
+ for (int bucket = 0; bucket < BOND_BUCKETS; bucket++) {
+ /* Convert ofp_port to odp_port. */
+ odp_map[bucket] = ofp_port_to_odp_port(ofproto, slave_map[bucket]);
+ }
+ return dpif_bond_add(ofproto->backer->dpif, bond_id, odp_map);
+}
+
+int
+ofproto_dpif_delete_lb_output_buckets(struct ofproto_dpif *ofproto,
+ uint32_t bond_id)
+{
+ return dpif_bond_del(ofproto->backer->dpif, bond_id);
+}
+
static void
send_pdu_cb(void *port_, const void *pdu, size_t pdu_size)
{
@@ -5572,6 +5601,7 @@ get_datapath_cap(const char *datapath_type, struct smap *cap)
smap_add(cap, "ct_timeout", s.ct_timeout ? "true" : "false");
smap_add(cap, "explicit_drop_action",
s.explicit_drop_action ? "true" :"false");
+ smap_add(cap, "lb_output_action", s.lb_output_action ? "true" : "false");
}
/* Gets timeout policy name in 'backer' based on 'zone', 'dl_type' and
diff --git a/ofproto/ofproto-dpif.h b/ofproto/ofproto-dpif.h
index aee61d61d..4e5ae0c9e 100644
--- a/ofproto/ofproto-dpif.h
+++ b/ofproto/ofproto-dpif.h
@@ -202,7 +202,10 @@ struct group_dpif *group_dpif_lookup(struct ofproto_dpif *,
DPIF_SUPPORT_FIELD(bool, ct_timeout, "Conntrack timeout policy") \
\
/* True if the datapath supports explicit drop action. */ \
- DPIF_SUPPORT_FIELD(bool, explicit_drop_action, "Explicit Drop action")
+ DPIF_SUPPORT_FIELD(bool, explicit_drop_action, "Explicit Drop action") \
+ \
+ /* True if the datapath supports balance_tcp optimization */ \
+ DPIF_SUPPORT_FIELD(bool, lb_output_action, "Optimized Balance TCP mode")
/* Stores the various features which the corresponding backer supports. */
@@ -382,6 +385,11 @@ int ofproto_dpif_add_internal_flow(struct ofproto_dpif *,
struct rule **rulep);
int ofproto_dpif_delete_internal_flow(struct ofproto_dpif *, struct match *,
int priority);
+int ofproto_dpif_add_lb_output_buckets(struct ofproto_dpif *, uint32_t bond_id,
+ const ofp_port_t *slave_map);
+int ofproto_dpif_delete_lb_output_buckets(struct ofproto_dpif *,
+ uint32_t bond_id);
+bool ovs_lb_output_action_supported(struct ofproto_dpif *);
bool ovs_native_tunneling_is_on(struct ofproto_dpif *);