author    Ben Pfaff <blp@ovn.org>    2018-01-19 12:42:24 -0800
committer Ben Pfaff <blp@ovn.org>    2018-01-19 12:42:24 -0800
commit    70bbaa46492d25d94154bebd4563d14d3e629068 (patch)
tree      ecf95cebf3b26961f50f67bc73a1fa47fca9811b /lib
parent    2927a4730b1f66078374f6951308196f43a91121 (diff)
parent    b2e8b12f8a821905c25295e04b843c0592a44339 (diff)
Merge branch 'dpdk_merge' of https://github.com/istokes/ovs into HEAD
Diffstat (limited to 'lib')
-rw-r--r--  lib/automake.mk         |   2
-rw-r--r--  lib/dpif-netdev-perf.c  |  60
-rw-r--r--  lib/dpif-netdev-perf.h  | 232
-rw-r--r--  lib/dpif-netdev.c       | 654
-rw-r--r--  lib/netdev-dpdk.c       |  63
5 files changed, 686 insertions(+), 325 deletions(-)
diff --git a/lib/automake.mk b/lib/automake.mk
index 4b38a1163..159319fa0 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -80,6 +80,8 @@ lib_libopenvswitch_la_SOURCES = \
lib/dpdk.h \
lib/dpif-netdev.c \
lib/dpif-netdev.h \
+ lib/dpif-netdev-perf.c \
+ lib/dpif-netdev-perf.h \
lib/dpif-provider.h \
lib/dpif.c \
lib/dpif.h \
diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
new file mode 100644
index 000000000..f06991ad8
--- /dev/null
+++ b/lib/dpif-netdev-perf.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 Ericsson AB.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/vlog.h"
+#include "dpif-netdev-perf.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(pmd_perf);
+
+void
+pmd_perf_stats_init(struct pmd_perf_stats *s)
+{
+ memset(s, 0 , sizeof(*s));
+}
+
+void
+pmd_perf_read_counters(struct pmd_perf_stats *s,
+ uint64_t stats[PMD_N_STATS])
+{
+ uint64_t val;
+
+ /* This loop subtracts the reference values (.zero[*]) from the counters.
+ * Since loads and stores are relaxed, it might be possible for a .zero[*]
+ * value to be more recent than the current value we're reading from the
+ * counter. This is not a big problem, since these numbers are not
+ * supposed to be 100% accurate, but we should at least make sure that
+ * the result is not negative. */
+ for (int i = 0; i < PMD_N_STATS; i++) {
+ atomic_read_relaxed(&s->counters.n[i], &val);
+ if (val > s->counters.zero[i]) {
+ stats[i] = val - s->counters.zero[i];
+ } else {
+ stats[i] = 0;
+ }
+ }
+}
+
+void
+pmd_perf_stats_clear(struct pmd_perf_stats *s)
+{
+ for (int i = 0; i < PMD_N_STATS; i++) {
+ atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
+ }
+}
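
The counters above follow a simple zero-offset scheme: n[] only ever grows,
pmd_perf_stats_clear() snapshots the current values into zero[], and
pmd_perf_read_counters() reports the difference. A minimal sketch of how a
caller could exercise this API (illustrative only, not part of the patch;
it assumes the declarations added in lib/dpif-netdev-perf.h below):

/* Illustrative sketch of the zero-offset counter scheme. */
#include "dpif-netdev-perf.h"

static void
perf_counter_example(void)
{
    struct pmd_perf_stats s;
    uint64_t stats[PMD_N_STATS];

    pmd_perf_stats_init(&s);
    pmd_perf_update_counter(&s, PMD_STAT_RECV, 100);
    pmd_perf_read_counters(&s, stats);   /* stats[PMD_STAT_RECV] == 100 */

    pmd_perf_stats_clear(&s);            /* zero[] := current n[] */
    pmd_perf_update_counter(&s, PMD_STAT_RECV, 25);
    pmd_perf_read_counters(&s, stats);   /* stats[PMD_STAT_RECV] == 25 */
}
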
diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
new file mode 100644
index 000000000..5993c25bc
--- /dev/null
+++ b/lib/dpif-netdev-perf.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2017 Ericsson AB.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PERF_H
+#define DPIF_NETDEV_PERF_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+#ifdef DPDK_NETDEV
+#include <rte_config.h>
+#include <rte_cycles.h>
+#endif
+
+#include "openvswitch/vlog.h"
+#include "ovs-atomic.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This module encapsulates data structures and functions to maintain PMD
+ * performance metrics such as packet counters and execution cycles. It
+ * provides a clean API for dpif-netdev to initialize, update, read, and
+ * reset these metrics.
+ */
+
+/* Set of counter types maintained in pmd_perf_stats. */
+
+enum pmd_stat_type {
+ PMD_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
+ PMD_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
+ PMD_STAT_MISS, /* Packets that did not match and upcall was ok. */
+ PMD_STAT_LOST, /* Packets that did not match and upcall failed. */
+ /* The above statistics account for the total
+ * number of packet passes through the datapath
+ * pipeline and should not be overlapping with each
+ * other. */
+ PMD_STAT_MASKED_LOOKUP, /* Number of subtable lookups for flow table
+ hits. Each MASKED_HIT hit will have >= 1
+ MASKED_LOOKUP(s). */
+ PMD_STAT_RECV, /* Packets entering the datapath pipeline from an
+ * interface. */
+ PMD_STAT_RECIRC, /* Packets reentering the datapath pipeline due to
+ * recirculation. */
+ PMD_STAT_SENT_PKTS, /* Packets that have been sent. */
+ PMD_STAT_SENT_BATCHES, /* Number of batches sent. */
+ PMD_CYCLES_ITER_IDLE, /* Cycles spent in idle iterations. */
+ PMD_CYCLES_ITER_BUSY, /* Cycles spent in busy iterations. */
+ PMD_N_STATS
+};
+
+/* Array of PMD counters indexed by enum pmd_stat_type.
+ * The n[] array contains the actual counter values since initialization
+ * of the PMD. Counters are atomically updated from the PMD but are
+ * read and cleared also from other processes. To clear the counters at
+ * PMD run-time, the current counter values are copied over to the zero[]
+ * array. To read counters we subtract zero[] value from n[]. */
+
+struct pmd_counters {
+ atomic_uint64_t n[PMD_N_STATS]; /* Value since _init(). */
+ uint64_t zero[PMD_N_STATS]; /* Value at last _clear(). */
+};
+
+/* Container for all performance metrics of a PMD.
+ * Part of the struct dp_netdev_pmd_thread. */
+
+struct pmd_perf_stats {
+ /* Start of the current PMD iteration in TSC cycles.*/
+ uint64_t start_it_tsc;
+ /* Latest TSC time stamp taken in PMD. */
+ uint64_t last_tsc;
+ /* If non-NULL, outermost cycle timer currently running in PMD. */
+ struct cycle_timer *cur_timer;
+ /* Set of PMD counters with their zero offsets. */
+ struct pmd_counters counters;
+};
+
+/* Support for accurate timing of PMD execution on TSC clock cycle level.
+ * These functions are intended to be invoked in the context of pmd threads. */
+
+/* Read the TSC cycle register and cache it. Any function not requiring clock
+ * cycle accuracy should read the cached value using cycles_counter_get() to
+ * avoid the overhead of reading the TSC register. */
+
+static inline uint64_t
+cycles_counter_update(struct pmd_perf_stats *s)
+{
+#ifdef DPDK_NETDEV
+ return s->last_tsc = rte_get_tsc_cycles();
+#else
+ return s->last_tsc = 0;
+#endif
+}
+
+static inline uint64_t
+cycles_counter_get(struct pmd_perf_stats *s)
+{
+ return s->last_tsc;
+}
+
+/* A nestable timer for measuring execution time in TSC cycles.
+ *
+ * Usage:
+ * struct cycle_timer timer;
+ *
+ * cycle_timer_start(pmd, &timer);
+ * <Timed execution>
+ * uint64_t cycles = cycle_timer_stop(pmd, &timer);
+ *
+ * The caller must guarantee that a call to cycle_timer_start() is always
+ * paired with a call to cycle_timer_stop().
+ *
+ * It is possible to have nested cycle timers within the timed code. The
+ * execution time measured by the nested timers is excluded from the time
+ * measured by the enclosing timer.
+ */
+
+struct cycle_timer {
+ uint64_t start;
+ uint64_t suspended;
+ struct cycle_timer *interrupted;
+};
+
+static inline void
+cycle_timer_start(struct pmd_perf_stats *s,
+ struct cycle_timer *timer)
+{
+ struct cycle_timer *cur_timer = s->cur_timer;
+ uint64_t now = cycles_counter_update(s);
+
+ if (cur_timer) {
+ cur_timer->suspended = now;
+ }
+ timer->interrupted = cur_timer;
+ timer->start = now;
+ timer->suspended = 0;
+ s->cur_timer = timer;
+}
+
+static inline uint64_t
+cycle_timer_stop(struct pmd_perf_stats *s,
+ struct cycle_timer *timer)
+{
+ /* Assert that this is the current cycle timer. */
+ ovs_assert(s->cur_timer == timer);
+ uint64_t now = cycles_counter_update(s);
+ struct cycle_timer *intr_timer = timer->interrupted;
+
+ if (intr_timer) {
+ /* Adjust the start offset by the suspended cycles. */
+ intr_timer->start += now - intr_timer->suspended;
+ }
+ /* Restore suspended timer, if any. */
+ s->cur_timer = intr_timer;
+ return now - timer->start;
+}
+
+void pmd_perf_stats_init(struct pmd_perf_stats *s);
+void pmd_perf_stats_clear(struct pmd_perf_stats *s);
+void pmd_perf_read_counters(struct pmd_perf_stats *s,
+ uint64_t stats[PMD_N_STATS]);
+
+/* PMD performance counters are updated lock-less. For real PMDs
+ * they are only updated from the PMD thread itself. In the case of the
+ * NON-PMD they might be updated from multiple threads, but we can live
+ * with losing a rare update as 100% accuracy is not required.
+ * However, as counters are read for display from outside the PMD thread
+ * with e.g. pmd-stats-show, we make sure that the 64-bit read and store
+ * operations are atomic also on 32-bit systems so that readers cannot
+ * read garbage. On 64-bit systems this incurs no overhead. */
+
+static inline void
+pmd_perf_update_counter(struct pmd_perf_stats *s,
+ enum pmd_stat_type counter, int delta)
+{
+ uint64_t tmp;
+ atomic_read_relaxed(&s->counters.n[counter], &tmp);
+ tmp += delta;
+ atomic_store_relaxed(&s->counters.n[counter], tmp);
+}
+
+static inline void
+pmd_perf_start_iteration(struct pmd_perf_stats *s)
+{
+ if (OVS_LIKELY(s->last_tsc)) {
+ /* We assume here that last_tsc was updated at the end of the
+ * previous iteration, or just before the first iteration. */
+ s->start_it_tsc = s->last_tsc;
+ } else {
+ /* In case last_tsc has never been set before. */
+ s->start_it_tsc = cycles_counter_update(s);
+ }
+}
+
+static inline void
+pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets)
+{
+ uint64_t cycles = cycles_counter_update(s) - s->start_it_tsc;
+
+ if (rx_packets > 0) {
+ pmd_perf_update_counter(s, PMD_CYCLES_ITER_BUSY, cycles);
+ } else {
+ pmd_perf_update_counter(s, PMD_CYCLES_ITER_IDLE, cycles);
+ }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DPIF_NETDEV_PERF_H */
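
The cycle timers nest: starting an inner timer suspends the enclosing one,
and the enclosing timer's result excludes the inner timer's cycles. A short
sketch of how the timers combine with the per-iteration busy/idle accounting
(illustrative only; poll_rx() and process() are hypothetical stand-ins for
real datapath work):

/* Illustrative sketch of nested cycle timers and iteration accounting. */
#include "dpif-netdev-perf.h"

static int poll_rx(void) { return 32; }       /* hypothetical rx burst */
static void process(int n) { (void) n; }      /* hypothetical processing */

static void
pmd_iteration_example(struct pmd_perf_stats *s)
{
    struct cycle_timer outer, inner;
    uint64_t outer_cycles, inner_cycles;
    int rx_packets;

    pmd_perf_start_iteration(s);

    cycle_timer_start(s, &outer);
    rx_packets = poll_rx();

    cycle_timer_start(s, &inner);               /* suspends 'outer' */
    process(rx_packets);
    inner_cycles = cycle_timer_stop(s, &inner);

    outer_cycles = cycle_timer_stop(s, &outer); /* excludes inner_cycles */
    (void) outer_cycles;
    (void) inner_cycles;

    /* Books the whole iteration as busy, since rx_packets > 0. */
    pmd_perf_end_iteration(s, rx_packets);
}
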
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c7d157ab6..f94b189e0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -32,10 +32,6 @@
#include <sys/stat.h>
#include <unistd.h>
-#ifdef DPDK_NETDEV
-#include <rte_cycles.h>
-#endif
-
#include "bitmap.h"
#include "cmap.h"
#include "conntrack.h"
@@ -44,6 +40,7 @@
#include "csum.h"
#include "dp-packet.h"
#include "dpif.h"
+#include "dpif-netdev-perf.h"
#include "dpif-provider.h"
#include "dummy.h"
#include "fat-rwlock.h"
@@ -86,6 +83,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
#define MAX_RECIRC_DEPTH 6
DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
+/* Use instant packet send by default. */
+#define DEFAULT_TX_FLUSH_INTERVAL 0
+
/* Configuration parameters. */
enum { MAX_FLOWS = 65536 }; /* Maximum number of flows in flow table. */
enum { MAX_METERS = 65536 }; /* Maximum number of meters. */
@@ -179,12 +179,13 @@ struct emc_cache {
/* Simple non-wildcarding single-priority classifier. */
-/* Time in ms between successive optimizations of the dpcls subtable vector */
-#define DPCLS_OPTIMIZATION_INTERVAL 1000
+/* Time in microseconds between successive optimizations of the dpcls
+ * subtable vector */
+#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
-/* Time in ms of the interval in which rxq processing cycles used in
- * rxq to pmd assignments is measured and stored. */
-#define PMD_RXQ_INTERVAL_LEN 10000
+/* Time in microseconds of the interval in which rxq processing cycles used
+ * in rxq to pmd assignments is measured and stored. */
+#define PMD_RXQ_INTERVAL_LEN 10000000LL
/* Number of intervals for which cycles are stored
* and used during rxq to pmd assignment. */
@@ -271,6 +272,9 @@ struct dp_netdev {
struct hmap ports;
struct seq *port_seq; /* Incremented whenever a port changes. */
+ /* The time that a packet can wait in output batch for sending. */
+ atomic_uint32_t tx_flush_interval;
+
/* Meters. */
struct ovs_mutex meter_locks[N_METER_LOCKS];
struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
@@ -331,25 +335,6 @@ static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
odp_port_t)
OVS_REQUIRES(dp->port_mutex);
-enum dp_stat_type {
- DP_STAT_EXACT_HIT, /* Packets that had an exact match (emc). */
- DP_STAT_MASKED_HIT, /* Packets that matched in the flow table. */
- DP_STAT_MISS, /* Packets that did not match. */
- DP_STAT_LOST, /* Packets not passed up to the client. */
- DP_STAT_LOOKUP_HIT, /* Number of subtable lookups for flow table
- hits */
- DP_STAT_SENT_PKTS, /* Packets that has been sent. */
- DP_STAT_SENT_BATCHES, /* Number of batches sent. */
- DP_N_STATS
-};
-
-enum pmd_cycles_counter_type {
- PMD_CYCLES_IDLE, /* Cycles spent idle or unsuccessful polling */
- PMD_CYCLES_PROCESSING, /* Cycles spent successfully polling and
- * processing polled packets */
- PMD_N_CYCLES
-};
-
enum rxq_cycles_counter_type {
RXQ_CYCLES_PROC_CURR, /* Cycles spent successfully polling and
processing packets during the current
@@ -359,7 +344,7 @@ enum rxq_cycles_counter_type {
RXQ_N_CYCLES
};
-#define XPS_TIMEOUT_MS 500LL
+#define XPS_TIMEOUT 500000LL /* In microseconds. */
/* Contained by struct dp_netdev_port's 'rxqs' member. */
struct dp_netdev_rxq {
@@ -499,21 +484,6 @@ struct dp_netdev_actions *dp_netdev_flow_get_actions(
const struct dp_netdev_flow *);
static void dp_netdev_actions_free(struct dp_netdev_actions *);
-/* Contained by struct dp_netdev_pmd_thread's 'stats' member. */
-struct dp_netdev_pmd_stats {
- /* Indexed by DP_STAT_*. */
- atomic_ullong n[DP_N_STATS];
-};
-
-/* Contained by struct dp_netdev_pmd_thread's 'cycle' member. */
-struct dp_netdev_pmd_cycles {
- /* Indexed by PMD_CYCLES_*. */
- atomic_ullong n[PMD_N_CYCLES];
-};
-
-static void dp_netdev_count_packet(struct dp_netdev_pmd_thread *,
- enum dp_stat_type type, int cnt);
-
struct polled_queue {
struct dp_netdev_rxq *rxq;
odp_port_t port_no;
@@ -532,7 +502,9 @@ struct tx_port {
int qid;
long long last_used;
struct hmap_node node;
+ long long flush_time;
struct dp_packet_batch output_pkts;
+ struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
};
/* A set of properties for the current processing loop that is not directly
@@ -542,8 +514,8 @@ struct tx_port {
struct dp_netdev_pmd_thread_ctx {
/* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
long long now;
- /* Used to count cycles. See 'cycles_count_end()' */
- unsigned long long last_cycles;
+ /* RX queue from which last packet was received. */
+ struct dp_netdev_rxq *last_rxq;
};
/* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
@@ -595,11 +567,10 @@ struct dp_netdev_pmd_thread {
are stored for each polled rxq. */
long long int rxq_next_cycle_store;
- /* Statistics. */
- struct dp_netdev_pmd_stats stats;
-
- /* Cycles counters */
- struct dp_netdev_pmd_cycles cycles;
+ /* Last interval timestamp. */
+ uint64_t intrvl_tsc_prev;
+ /* Last interval cycles. */
+ atomic_ullong intrvl_cycles;
/* Current context of the PMD thread. */
struct dp_netdev_pmd_thread_ctx ctx;
@@ -618,6 +589,9 @@ struct dp_netdev_pmd_thread {
* than 'cmap_count(dp->poll_threads)'. */
uint32_t static_tx_qid;
+ /* Number of filled output batches. */
+ int n_output_batches;
+
struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
/* List of rx queues to poll. */
struct hmap poll_list OVS_GUARDED;
@@ -638,12 +612,8 @@ struct dp_netdev_pmd_thread {
struct hmap tnl_port_cache;
struct hmap send_port_cache;
- /* Only a pmd thread can write on its own 'cycles' and 'stats'.
- * The main thread keeps 'stats_zero' and 'cycles_zero' as base
- * values and subtracts them from 'stats' and 'cycles' before
- * reporting to the user */
- unsigned long long stats_zero[DP_N_STATS];
- uint64_t cycles_zero[PMD_N_CYCLES];
+ /* Keep track of detailed PMD performance statistics. */
+ struct pmd_perf_stats perf_stats;
/* Set to true if the pmd thread needs to be reloaded. */
bool need_reload;
@@ -711,8 +681,9 @@ static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
struct rxq_poll *poll)
OVS_REQUIRES(pmd->port_mutex);
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd);
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+ bool force);
static void reconfigure_datapath(struct dp_netdev *dp)
OVS_REQUIRES(dp->port_mutex);
@@ -803,7 +774,7 @@ emc_cache_slow_sweep(struct emc_cache *flow_cache)
static inline void
pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
{
- pmd->ctx.now = time_msec();
+ pmd->ctx.now = time_usec();
}
/* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
@@ -833,47 +804,10 @@ enum pmd_info_type {
};
static void
-pmd_info_show_stats(struct ds *reply,
- struct dp_netdev_pmd_thread *pmd,
- unsigned long long stats[DP_N_STATS],
- uint64_t cycles[PMD_N_CYCLES])
+format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
- unsigned long long total_packets;
- uint64_t total_cycles = 0;
- double lookups_per_hit = 0, packets_per_batch = 0;
- int i;
-
- /* These loops subtracts reference values ('*_zero') from the counters.
- * Since loads and stores are relaxed, it might be possible for a '*_zero'
- * value to be more recent than the current value we're reading from the
- * counter. This is not a big problem, since these numbers are not
- * supposed to be too accurate, but we should at least make sure that
- * the result is not negative. */
- for (i = 0; i < DP_N_STATS; i++) {
- if (stats[i] > pmd->stats_zero[i]) {
- stats[i] -= pmd->stats_zero[i];
- } else {
- stats[i] = 0;
- }
- }
-
- /* Sum of all the matched and not matched packets gives the total. */
- total_packets = stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]
- + stats[DP_STAT_MISS];
-
- for (i = 0; i < PMD_N_CYCLES; i++) {
- if (cycles[i] > pmd->cycles_zero[i]) {
- cycles[i] -= pmd->cycles_zero[i];
- } else {
- cycles[i] = 0;
- }
-
- total_cycles += cycles[i];
- }
-
ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
? "main thread" : "pmd thread");
-
if (pmd->numa_id != OVS_NUMA_UNSPEC) {
ds_put_format(reply, " numa_id %d", pmd->numa_id);
}
@@ -881,23 +815,52 @@ pmd_info_show_stats(struct ds *reply,
ds_put_format(reply, " core_id %u", pmd->core_id);
}
ds_put_cstr(reply, ":\n");
+}
+
+static void
+pmd_info_show_stats(struct ds *reply,
+ struct dp_netdev_pmd_thread *pmd)
+{
+ uint64_t stats[PMD_N_STATS];
+ uint64_t total_cycles, total_packets;
+ double passes_per_pkt = 0;
+ double lookups_per_hit = 0;
+ double packets_per_batch = 0;
+
+ pmd_perf_read_counters(&pmd->perf_stats, stats);
+ total_cycles = stats[PMD_CYCLES_ITER_IDLE]
+ + stats[PMD_CYCLES_ITER_BUSY];
+ total_packets = stats[PMD_STAT_RECV];
- if (stats[DP_STAT_MASKED_HIT] > 0) {
- lookups_per_hit = stats[DP_STAT_LOOKUP_HIT]
- / (double) stats[DP_STAT_MASKED_HIT];
+ format_pmd_thread(reply, pmd);
+
+ if (total_packets > 0) {
+ passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC])
+ / (double) total_packets;
+ }
+ if (stats[PMD_STAT_MASKED_HIT] > 0) {
+ lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP]
+ / (double) stats[PMD_STAT_MASKED_HIT];
}
- if (stats[DP_STAT_SENT_BATCHES] > 0) {
- packets_per_batch = stats[DP_STAT_SENT_PKTS]
- / (double) stats[DP_STAT_SENT_BATCHES];
+ if (stats[PMD_STAT_SENT_BATCHES] > 0) {
+ packets_per_batch = stats[PMD_STAT_SENT_PKTS]
+ / (double) stats[PMD_STAT_SENT_BATCHES];
}
ds_put_format(reply,
- "\temc hits:%llu\n\tmegaflow hits:%llu\n"
- "\tavg. subtable lookups per hit:%.2f\n"
- "\tmiss:%llu\n\tlost:%llu\n"
- "\tavg. packets per output batch: %.2f\n",
- stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT],
- lookups_per_hit, stats[DP_STAT_MISS], stats[DP_STAT_LOST],
+ "\tpackets received: %"PRIu64"\n"
+ "\tpacket recirculations: %"PRIu64"\n"
+ "\tavg. datapath passes per packet: %.02f\n"
+ "\temc hits: %"PRIu64"\n"
+ "\tmegaflow hits: %"PRIu64"\n"
+ "\tavg. subtable lookups per megaflow hit: %.02f\n"
+ "\tmiss with success upcall: %"PRIu64"\n"
+ "\tmiss with failed upcall: %"PRIu64"\n"
+ "\tavg. packets per output batch: %.02f\n",
+ total_packets, stats[PMD_STAT_RECIRC],
+ passes_per_pkt, stats[PMD_STAT_EXACT_HIT],
+ stats[PMD_STAT_MASKED_HIT], lookups_per_hit,
+ stats[PMD_STAT_MISS], stats[PMD_STAT_LOST],
packets_per_batch);
if (total_cycles == 0) {
@@ -905,48 +868,27 @@ pmd_info_show_stats(struct ds *reply,
}
ds_put_format(reply,
- "\tidle cycles:%"PRIu64" (%.02f%%)\n"
- "\tprocessing cycles:%"PRIu64" (%.02f%%)\n",
- cycles[PMD_CYCLES_IDLE],
- cycles[PMD_CYCLES_IDLE] / (double)total_cycles * 100,
- cycles[PMD_CYCLES_PROCESSING],
- cycles[PMD_CYCLES_PROCESSING] / (double)total_cycles * 100);
+ "\tidle cycles: %"PRIu64" (%.02f%%)\n"
+ "\tprocessing cycles: %"PRIu64" (%.02f%%)\n",
+ stats[PMD_CYCLES_ITER_IDLE],
+ stats[PMD_CYCLES_ITER_IDLE] / (double) total_cycles * 100,
+ stats[PMD_CYCLES_ITER_BUSY],
+ stats[PMD_CYCLES_ITER_BUSY] / (double) total_cycles * 100);
if (total_packets == 0) {
return;
}
ds_put_format(reply,
- "\tavg cycles per packet: %.02f (%"PRIu64"/%llu)\n",
- total_cycles / (double)total_packets,
+ "\tavg cycles per packet: %.02f (%"PRIu64"/%"PRIu64")\n",
+ total_cycles / (double) total_packets,
total_cycles, total_packets);
ds_put_format(reply,
"\tavg processing cycles per packet: "
- "%.02f (%"PRIu64"/%llu)\n",
- cycles[PMD_CYCLES_PROCESSING] / (double)total_packets,
- cycles[PMD_CYCLES_PROCESSING], total_packets);
-}
-
-static void
-pmd_info_clear_stats(struct ds *reply OVS_UNUSED,
- struct dp_netdev_pmd_thread *pmd,
- unsigned long long stats[DP_N_STATS],
- uint64_t cycles[PMD_N_CYCLES])
-{
- int i;
-
- /* We cannot write 'stats' and 'cycles' (because they're written by other
- * threads) and we shouldn't change 'stats' (because they're used to count
- * datapath stats, which must not be cleared here). Instead, we save the
- * current values and subtract them from the values to be displayed in the
- * future */
- for (i = 0; i < DP_N_STATS; i++) {
- pmd->stats_zero[i] = stats[i];
- }
- for (i = 0; i < PMD_N_CYCLES; i++) {
- pmd->cycles_zero[i] = cycles[i];
- }
+ "%.02f (%"PRIu64"/%"PRIu64")\n",
+ stats[PMD_CYCLES_ITER_BUSY] / (double) total_packets,
+ stats[PMD_CYCLES_ITER_BUSY], total_packets);
}
static int
@@ -995,9 +937,9 @@ static void
pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
{
if (pmd->core_id != NON_PMD_CORE_ID) {
- const char *prev_name = NULL;
struct rxq_poll *list;
- size_t i, n;
+ size_t n_rxq;
+ uint64_t total_cycles = 0;
ds_put_format(reply,
"pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
@@ -1005,22 +947,34 @@ pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
? "true" : "false");
ovs_mutex_lock(&pmd->port_mutex);
- sorted_poll_list(pmd, &list, &n);
- for (i = 0; i < n; i++) {
- const char *name = netdev_rxq_get_name(list[i].rxq->rx);
+ sorted_poll_list(pmd, &list, &n_rxq);
- if (!prev_name || strcmp(name, prev_name)) {
- if (prev_name) {
- ds_put_cstr(reply, "\n");
- }
- ds_put_format(reply, "\tport: %s\tqueue-id:", name);
+ /* Get the total pmd cycles for an interval. */
+ atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
+ /* Estimate the cycles to cover all intervals. */
+ total_cycles *= PMD_RXQ_INTERVAL_MAX;
+
+ for (int i = 0; i < n_rxq; i++) {
+ struct dp_netdev_rxq *rxq = list[i].rxq;
+ const char *name = netdev_rxq_get_name(rxq->rx);
+ uint64_t proc_cycles = 0;
+
+ for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
+ proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
}
- ds_put_format(reply, " %d",
+ ds_put_format(reply, "\tport: %-16s\tqueue-id: %2d", name,
netdev_rxq_get_queue_id(list[i].rxq->rx));
- prev_name = name;
+ ds_put_format(reply, "\tpmd usage: ");
+ if (total_cycles) {
+ ds_put_format(reply, "%2"PRIu64"",
+ proc_cycles * 100 / total_cycles);
+ ds_put_cstr(reply, " %");
+ } else {
+ ds_put_format(reply, "%s", "NOT AVAIL");
+ }
+ ds_put_cstr(reply, "\n");
}
ovs_mutex_unlock(&pmd->port_mutex);
- ds_put_cstr(reply, "\n");
free(list);
}
}
@@ -1106,23 +1060,37 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
struct ds reply = DS_EMPTY_INITIALIZER;
struct dp_netdev_pmd_thread **pmd_list;
struct dp_netdev *dp = NULL;
- size_t n;
enum pmd_info_type type = *(enum pmd_info_type *) aux;
+ unsigned int core_id;
+ bool filter_on_pmd = false;
+ size_t n;
ovs_mutex_lock(&dp_netdev_mutex);
- if (argc == 2) {
- dp = shash_find_data(&dp_netdevs, argv[1]);
- } else if (shash_count(&dp_netdevs) == 1) {
- /* There's only one datapath */
- dp = shash_first(&dp_netdevs)->data;
+ while (argc > 1) {
+ if (!strcmp(argv[1], "-pmd") && argc >= 3) {
+ if (str_to_uint(argv[2], 10, &core_id)) {
+ filter_on_pmd = true;
+ }
+ argc -= 2;
+ argv += 2;
+ } else {
+ dp = shash_find_data(&dp_netdevs, argv[1]);
+ argc -= 1;
+ argv += 1;
+ }
}
if (!dp) {
- ovs_mutex_unlock(&dp_netdev_mutex);
- unixctl_command_reply_error(conn,
- "please specify an existing datapath");
- return;
+ if (shash_count(&dp_netdevs) == 1) {
+ /* There's only one datapath */
+ dp = shash_first(&dp_netdevs)->data;
+ } else {
+ ovs_mutex_unlock(&dp_netdev_mutex);
+ unixctl_command_reply_error(conn,
+ "please specify an existing datapath");
+ return;
+ }
}
sorted_poll_thread_list(dp, &pmd_list, &n);
@@ -1131,26 +1099,15 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
if (!pmd) {
break;
}
-
+ if (filter_on_pmd && pmd->core_id != core_id) {
+ continue;
+ }
if (type == PMD_INFO_SHOW_RXQ) {
pmd_info_show_rxq(&reply, pmd);
- } else {
- unsigned long long stats[DP_N_STATS];
- uint64_t cycles[PMD_N_CYCLES];
-
- /* Read current stats and cycle counters */
- for (size_t j = 0; j < ARRAY_SIZE(stats); j++) {
- atomic_read_relaxed(&pmd->stats.n[j], &stats[j]);
- }
- for (size_t j = 0; j < ARRAY_SIZE(cycles); j++) {
- atomic_read_relaxed(&pmd->cycles.n[j], &cycles[j]);
- }
-
- if (type == PMD_INFO_CLEAR_STATS) {
- pmd_info_clear_stats(&reply, pmd, stats, cycles);
- } else if (type == PMD_INFO_SHOW_STATS) {
- pmd_info_show_stats(&reply, pmd, stats, cycles);
- }
+ } else if (type == PMD_INFO_CLEAR_STATS) {
+ pmd_perf_stats_clear(&pmd->perf_stats);
+ } else if (type == PMD_INFO_SHOW_STATS) {
+ pmd_info_show_stats(&reply, pmd);
}
}
free(pmd_list);
@@ -1168,14 +1125,14 @@ dpif_netdev_init(void)
clear_aux = PMD_INFO_CLEAR_STATS,
poll_aux = PMD_INFO_SHOW_RXQ;
- unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
- 0, 1, dpif_netdev_pmd_info,
+ unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
+ 0, 3, dpif_netdev_pmd_info,
(void *)&show_aux);
- unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
- 0, 1, dpif_netdev_pmd_info,
+ unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
+ 0, 3, dpif_netdev_pmd_info,
(void *)&clear_aux);
- unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
- 0, 1, dpif_netdev_pmd_info,
+ unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
+ 0, 3, dpif_netdev_pmd_info,
(void *)&poll_aux);
unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
0, 1, dpif_netdev_pmd_rebalance,
@@ -1312,6 +1269,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
conntrack_init(&dp->conntrack);
atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
+ atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
cmap_init(&dp->poll_threads);
@@ -1511,20 +1469,16 @@ dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
{
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *pmd;
+ uint64_t pmd_stats[PMD_N_STATS];
stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
- unsigned long long n;
stats->n_flows += cmap_count(&pmd->flow_table);
-
- atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
- stats->n_hit += n;
- atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
- stats->n_hit += n;
- atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
- stats->n_missed += n;
- atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
- stats->n_lost += n;
+ pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
+ stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
+ stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
+ stats->n_missed += pmd_stats[PMD_STAT_MISS];
+ stats->n_lost += pmd_stats[PMD_STAT_LOST];
}
stats->n_masks = UINT32_MAX;
stats->n_mask_hit = UINT64_MAX;
@@ -2982,7 +2936,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
dp_packet_batch_init_packet(&pp, execute->packet);
dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
execute->actions, execute->actions_len);
- dp_netdev_pmd_flush_output_packets(pmd);
+ dp_netdev_pmd_flush_output_packets(pmd, true);
if (pmd->core_id == NON_PMD_CORE_ID) {
ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -3031,6 +2985,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
smap_get_ullong(other_config, "emc-insert-inv-prob",
DEFAULT_EM_FLOW_INSERT_INV_PROB);
uint32_t insert_min, cur_min;
+ uint32_t tx_flush_interval, cur_tx_flush_interval;
+
+ tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
+ DEFAULT_TX_FLUSH_INTERVAL);
+ atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
+ if (tx_flush_interval != cur_tx_flush_interval) {
+ atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
+ VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
+ tx_flush_interval);
+ }
if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
free(dp->pmd_cmask);
@@ -3184,64 +3148,20 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
free(actions);
}
-static inline unsigned long long
-cycles_counter(void)
-{
-#ifdef DPDK_NETDEV
- return rte_get_tsc_cycles();
-#else
- return 0;
-#endif
-}
-
-/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
-extern struct ovs_mutex cycles_counter_fake_mutex;
-
-/* Start counting cycles. Must be followed by 'cycles_count_end()' */
-static inline void
-cycles_count_start(struct dp_netdev_pmd_thread *pmd)
- OVS_ACQUIRES(&cycles_counter_fake_mutex)
- OVS_NO_THREAD_SAFETY_ANALYSIS
-{
- pmd->ctx.last_cycles = cycles_counter();
-}
-
-/* Stop counting cycles and add them to the counter 'type' */
-static inline void
-cycles_count_end(struct dp_netdev_pmd_thread *pmd,
- enum pmd_cycles_counter_type type)
- OVS_RELEASES(&cycles_counter_fake_mutex)
- OVS_NO_THREAD_SAFETY_ANALYSIS
-{
- unsigned long long interval = cycles_counter() - pmd->ctx.last_cycles;
-
- non_atomic_ullong_add(&pmd->cycles.n[type], interval);
-}
-
-/* Calculate the intermediate cycle result and add to the counter 'type' */
-static inline void
-cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
- struct dp_netdev_rxq *rxq,
- enum pmd_cycles_counter_type type)
- OVS_NO_THREAD_SAFETY_ANALYSIS
+static void
+dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
+ enum rxq_cycles_counter_type type,
+ unsigned long long cycles)
{
- unsigned long long new_cycles = cycles_counter();
- unsigned long long interval = new_cycles - pmd->ctx.last_cycles;
- pmd->ctx.last_cycles = new_cycles;
-
- non_atomic_ullong_add(&pmd->cycles.n[type], interval);
- if (rxq && (type == PMD_CYCLES_PROCESSING)) {
- /* Add to the amount of current processing cycles. */
- non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], interval);
- }
+ atomic_store_relaxed(&rx->cycles[type], cycles);
}
static void
-dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
+dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
enum rxq_cycles_counter_type type,
unsigned long long cycles)
{
- atomic_store_relaxed(&rx->cycles[type], cycles);
+ non_atomic_ullong_add(&rx->cycles[type], cycles);
}
static uint64_t
@@ -3269,13 +3189,19 @@ dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
return processing_cycles;
}
-static void
+static int
dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
struct tx_port *p)
{
+ int i;
int tx_qid;
int output_cnt;
bool dynamic_txqs;
+ struct cycle_timer timer;
+ uint64_t cycles;
+ uint32_t tx_flush_interval;
+
+ cycle_timer_start(&pmd->perf_stats, &timer);
dynamic_txqs = p->port->dynamic_txqs;
if (dynamic_txqs) {
@@ -3285,52 +3211,99 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
}
output_cnt = dp_packet_batch_size(&p->output_pkts);
+ ovs_assert(output_cnt > 0);
netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
dp_packet_batch_init(&p->output_pkts);
- dp_netdev_count_packet(pmd, DP_STAT_SENT_PKTS, output_cnt);
- dp_netdev_count_packet(pmd, DP_STAT_SENT_BATCHES, 1);
+ /* Update time of the next flush. */
+ atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
+ p->flush_time = pmd->ctx.now + tx_flush_interval;
+
+ ovs_assert(pmd->n_output_batches > 0);
+ pmd->n_output_batches--;
+
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
+
+ /* Distribute send cycles evenly among transmitted packets and assign to
+ * their respective rx queues. */
+ cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
+ for (i = 0; i < output_cnt; i++) {
+ if (p->output_pkts_rxqs[i]) {
+ dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
+ RXQ_CYCLES_PROC_CURR, cycles);
+ }
+ }
+
+ return output_cnt;
}
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+ bool force)
{
struct tx_port *p;
+ int output_cnt = 0;
+
+ if (!pmd->n_output_batches) {
+ return 0;
+ }
HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
- if (!dp_packet_batch_is_empty(&p->output_pkts)) {
- dp_netdev_pmd_flush_output_on_port(pmd, p);
+ if (!dp_packet_batch_is_empty(&p->output_pkts)
+ && (force || pmd->ctx.now >= p->flush_time)) {
+ output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
}
}
+ return output_cnt;
}
static int
dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
- struct netdev_rxq *rx,
+ struct dp_netdev_rxq *rxq,
odp_port_t port_no)
{
struct dp_packet_batch batch;
+ struct cycle_timer timer;
int error;
- int batch_cnt = 0;
+ int batch_cnt = 0, output_cnt = 0;
+ uint64_t cycles;
+
+ /* Measure duration for polling and processing rx burst. */
+ cycle_timer_start(&pmd->perf_stats, &timer);
+ pmd->ctx.last_rxq = rxq;
dp_packet_batch_init(&batch);
- error = netdev_rxq_recv(rx, &batch);
+
+ error = netdev_rxq_recv(rxq->rx, &batch);
if (!error) {
+ /* At least one packet received. */
*recirc_depth_get() = 0;
pmd_thread_ctx_time_update(pmd);
batch_cnt = batch.count;
dp_netdev_input(pmd, &batch, port_no);
- dp_netdev_pmd_flush_output_packets(pmd);
- } else if (error != EAGAIN && error != EOPNOTSUPP) {
- static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
- VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
- netdev_rxq_get_name(rx), ovs_strerror(error));
+ /* Assign processing cycles to rx queue. */
+ cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
+ dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
+
+ output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
+ } else {
+ /* Discard cycles. */
+ cycle_timer_stop(&pmd->perf_stats, &timer);
+ if (error != EAGAIN && error != EOPNOTSUPP) {
+ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+ VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
+ netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
+ }
}
- return batch_cnt;
+ pmd->ctx.last_rxq = NULL;
+
+ return batch_cnt + output_cnt;
}
static struct tx_port *
@@ -3953,31 +3926,33 @@ dpif_netdev_run(struct dpif *dpif)
struct dp_netdev *dp = get_dp_netdev(dpif);
struct dp_netdev_pmd_thread *non_pmd;
uint64_t new_tnl_seq;
- int process_packets = 0;
+ bool need_to_flush = true;
ovs_mutex_lock(&dp->port_mutex);
non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
if (non_pmd) {
ovs_mutex_lock(&dp->non_pmd_mutex);
- cycles_count_start(non_pmd);
HMAP_FOR_EACH (port, node, &dp->ports) {
if (!netdev_is_pmd(port->netdev)) {
int i;
for (i = 0; i < port->n_rxq; i++) {
- process_packets =
- dp_netdev_process_rxq_port(non_pmd,
- port->rxqs[i].rx,
- port->port_no);
- cycles_count_intermediate(non_pmd, NULL,
- process_packets
- ? PMD_CYCLES_PROCESSING
- : PMD_CYCLES_IDLE);
+ if (dp_netdev_process_rxq_port(non_pmd,
+ &port->rxqs[i],
+ port->port_no)) {
+ need_to_flush = false;
+ }
}
}
}
- cycles_count_end(non_pmd, PMD_CYCLES_IDLE);
- pmd_thread_ctx_time_update(non_pmd);
+ if (need_to_flush) {
+ /* We didn't receive anything in the process loop.
+ * Check if we need to send something.
+ * There were no time updates during the current iteration. */
+ pmd_thread_ctx_time_update(non_pmd);
+ dp_netdev_pmd_flush_output_packets(non_pmd, false);
+ }
+
dpif_netdev_xps_revalidate_pmd(non_pmd, false);
ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -4028,6 +4003,8 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
{
struct tx_port *tx_port_cached;
+ /* Flush all the queued packets. */
+ dp_netdev_pmd_flush_output_packets(pmd, true);
/* Free all used tx queue ids. */
dpif_netdev_xps_revalidate_pmd(pmd, true);
@@ -4121,6 +4098,7 @@ static void *
pmd_thread_main(void *f_)
{
struct dp_netdev_pmd_thread *pmd = f_;
+ struct pmd_perf_stats *s = &pmd->perf_stats;
unsigned int lc = 0;
struct polled_queue *poll_list;
bool exiting;
@@ -4144,6 +4122,8 @@ reload:
VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
+ /* Reset the rxq current cycles counter. */
+ dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
}
if (!poll_cnt) {
@@ -4154,15 +4134,26 @@ reload:
lc = UINT_MAX;
}
- cycles_count_start(pmd);
+ pmd->intrvl_tsc_prev = 0;
+ atomic_store_relaxed(&pmd->intrvl_cycles, 0);
+ cycles_counter_update(s);
for (;;) {
+ uint64_t iter_packets = 0;
+
+ pmd_perf_start_iteration(s);
for (i = 0; i < poll_cnt; i++) {
process_packets =
- dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
+ dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
poll_list[i].port_no);
- cycles_count_intermediate(pmd, poll_list[i].rxq,
- process_packets ? PMD_CYCLES_PROCESSING
- : PMD_CYCLES_IDLE);
+ iter_packets += process_packets;
+ }
+
+ if (!iter_packets) {
+ /* We didn't receive anything in the process loop.
+ * Check if we need to send something.
+ * There were no time updates during the current iteration. */
+ pmd_thread_ctx_time_update(pmd);
+ iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false);
}
if (lc++ > 1024) {
@@ -4171,9 +4162,6 @@ reload:
lc = 0;
coverage_try_clear();
- /* It's possible that the time was not updated on current
- * iteration, if there were no received packets. */
- pmd_thread_ctx_time_update(pmd);
dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
if (!ovsrcu_try_quiesce()) {
emc_cache_slow_sweep(&pmd->flow_cache);
@@ -4184,10 +4172,9 @@ reload:
break;
}
}
+ pmd_perf_end_iteration(s, iter_packets);
}
- cycles_count_end(pmd, PMD_CYCLES_IDLE);
-
poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
exiting = latch_is_set(&pmd->exit_latch);
/* Signal here to make sure the pmd finishes
@@ -4259,7 +4246,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
/* All packets will hit the meter at the same time. */
- long_delta_t = (now - meter->used); /* msec */
+ long_delta_t = (now - meter->used) / 1000; /* msec */
/* Make sure delta_t will not be too large, so that bucket will not
* wrap around below. */
@@ -4415,7 +4402,7 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
meter->flags = config->flags;
meter->n_bands = config->n_bands;
meter->max_delta_t = 0;
- meter->used = time_msec();
+ meter->used = time_usec();
/* set up bands */
for (i = 0; i < config->n_bands; ++i) {
@@ -4613,6 +4600,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
pmd->core_id = core_id;
pmd->numa_id = numa_id;
pmd->need_reload = false;
+ pmd->n_output_batches = 0;
ovs_refcount_init(&pmd->ref_cnt);
latch_init(&pmd->exit_latch);
@@ -4625,6 +4613,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
ovs_mutex_init(&pmd->port_mutex);
cmap_init(&pmd->flow_table);
cmap_init(&pmd->classifiers);
+ pmd->ctx.last_rxq = NULL;
pmd_thread_ctx_time_update(pmd);
pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
@@ -4638,6 +4627,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
emc_cache_init(&pmd->flow_cache);
pmd_alloc_static_tx_qid(pmd);
}
+ pmd_perf_stats_init(&pmd->perf_stats);
cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
hash_int(core_id, 0));
}
@@ -4800,6 +4790,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
tx->port = port;
tx->qid = -1;
+ tx->flush_time = 0LL;
dp_packet_batch_init(&tx->output_pkts);
hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
@@ -4838,13 +4829,6 @@ dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
}
-static void
-dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
- enum dp_stat_type type, int cnt)
-{
- non_atomic_ullong_add(&pmd->stats.n[type], cnt);
-}
-
static int
dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
@@ -4963,7 +4947,7 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
struct dp_netdev_flow *flow = batch->flow;
dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
- batch->tcp_flags, pmd->ctx.now);
+ batch->tcp_flags, pmd->ctx.now / 1000);
actions = dp_netdev_flow_get_actions(flow);
@@ -5017,6 +5001,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
int i;
atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
+ pmd_perf_update_counter(&pmd->perf_stats,
+ md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
+ cnt);
DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
struct dp_netdev_flow *flow;
@@ -5065,18 +5052,17 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
}
}
- dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT,
- cnt - n_dropped - n_missed);
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT,
+ cnt - n_dropped - n_missed);
return dp_packet_batch_size(packets_);
}
-static inline void
+static inline int
handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
struct dp_packet *packet,
const struct netdev_flow_key *key,
- struct ofpbuf *actions, struct ofpbuf *put_actions,
- int *lost_cnt)
+ struct ofpbuf *actions, struct ofpbuf *put_actions)
{
struct ofpbuf *add_actions;
struct dp_packet_batch b;
@@ -5096,8 +5082,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
put_actions);
if (OVS_UNLIKELY(error && error != ENOSPC)) {
dp_packet_delete(packet);
- (*lost_cnt)++;
- return;
+ return error;
}
/* The Netlink encoding of datapath flow keys cannot express
@@ -5137,6 +5122,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
ovs_mutex_unlock(&pmd->flow_mutex);
emc_probabilistic_insert(pmd, key, netdev_flow);
}
+ return error;
}
static inline void
@@ -5158,7 +5144,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
struct dpcls *cls;
struct dpcls_rule *rules[PKT_ARRAY_SIZE];
struct dp_netdev *dp = pmd->dp;
- int miss_cnt = 0, lost_cnt = 0;
+ int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
int lookup_cnt = 0, add_lookup_cnt;
bool any_miss;
size_t i;
@@ -5200,9 +5186,14 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
continue;
}
- miss_cnt++;
- handle_packet_upcall(pmd, packet, &keys[i], &actions,
- &put_actions, &lost_cnt);
+ int error = handle_packet_upcall(pmd, packet, &keys[i],
+ &actions, &put_actions);
+
+ if (OVS_UNLIKELY(error)) {
+ upcall_fail_cnt++;
+ } else {
+ upcall_ok_cnt++;
+ }
}
ofpbuf_uninit(&actions);
@@ -5212,8 +5203,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
if (OVS_UNLIKELY(!rules[i])) {
dp_packet_delete(packet);
- lost_cnt++;
- miss_cnt++;
+ upcall_fail_cnt++;
}
}
}
@@ -5231,10 +5221,14 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
dp_netdev_queue_batches(packet, flow, &keys[i].mf, batches, n_batches);
}
- dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
- dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
- dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
- dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
+ cnt - upcall_ok_cnt - upcall_fail_cnt);
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
+ lookup_cnt);
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
+ upcall_ok_cnt);
+ pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
+ upcall_fail_cnt);
}
/* Packets enter the datapath from a port (or from recirculation) here.
@@ -5338,7 +5332,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
continue;
}
interval = pmd->ctx.now - tx->last_used;
- if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
+ if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
port = tx->port;
ovs_mutex_lock(&port->txq_used_mutex);
port->txq_used[tx->qid]--;
@@ -5359,7 +5353,7 @@ dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
interval = pmd->ctx.now - tx->last_used;
tx->last_used = pmd->ctx.now;
- if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
+ if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
return tx->qid;
}
@@ -5491,13 +5485,19 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
dp_netdev_pmd_flush_output_on_port(pmd, p);
}
#endif
- if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
- + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
- /* Some packets was generated while input batch processing.
- * Flush here to avoid overflow. */
+ if (dp_packet_batch_size(&p->output_pkts)
+ + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
+ /* Flush here to avoid overflow. */
dp_netdev_pmd_flush_output_on_port(pmd, p);
}
+
+ if (dp_packet_batch_is_empty(&p->output_pkts)) {
+ pmd->n_output_batches++;
+ }
+
DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
+ p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
+ pmd->ctx.last_rxq;
dp_packet_batch_add(&p->output_pkts, packet);
}
return;
@@ -5738,7 +5738,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
commit, zone, setmark, setlabel, aux->flow->tp_src,
aux->flow->tp_dst, helper, nat_action_info_ref,
- pmd->ctx.now);
+ pmd->ctx.now / 1000);
break;
}
@@ -6135,6 +6135,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
struct dpcls *cls;
if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
+ uint64_t curr_tsc;
/* Get the cycles that were used to process each queue and store. */
for (unsigned i = 0; i < poll_cnt; i++) {
uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
@@ -6143,6 +6144,13 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
0);
}
+ curr_tsc = cycles_counter_update(&pmd->perf_stats);
+ if (pmd->intrvl_tsc_prev) {
+ /* There is a prev timestamp, store a new intrvl cycle count. */
+ atomic_store_relaxed(&pmd->intrvl_cycles,
+ curr_tsc - pmd->intrvl_tsc_prev);
+ }
+ pmd->intrvl_tsc_prev = curr_tsc;
/* Start new measuring interval */
pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
}
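
On the configuration side, the dpif-netdev changes add a "tx-flush-interval"
other_config knob (the number of microseconds a packet may wait in an output
batch before it is flushed; 0 keeps the previous instant-send behavior) and a
"-pmd <core>" filter for the pmd-stats and pmd-rxq commands. Illustrative
invocations, assuming the usual ovs-vsctl/ovs-appctl setup:

    ovs-vsctl set Open_vSwitch . other_config:tx-flush-interval=50
    ovs-appctl dpif-netdev/pmd-stats-show -pmd 3
    ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
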
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index e32c7f678..ac2e38e7e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2436,6 +2436,7 @@ netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
&policer->app_srtcm_params);
if (err) {
VLOG_ERR("Could not create rte meter for ingress policer");
+ free(policer);
return NULL;
}
@@ -2615,6 +2616,64 @@ netdev_dpdk_update_flags(struct netdev *netdev,
}
static int
+netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
+ struct smap *args)
+{
+ struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+ ovs_mutex_lock(&dev->mutex);
+
+ bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
+ smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
+
+ int vid = netdev_dpdk_get_vid(dev);
+ if (vid < 0) {
+ smap_add_format(args, "status", "disconnected");
+ ovs_mutex_unlock(&dev->mutex);
+ return 0;
+ } else {
+ smap_add_format(args, "status", "connected");
+ }
+
+ char socket_name[PATH_MAX];
+ if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
+ smap_add_format(args, "socket", "%s", socket_name);
+ }
+
+ uint64_t features;
+ if (!rte_vhost_get_negotiated_features(vid, &features)) {
+ smap_add_format(args, "features", "0x%016"PRIx64, features);
+ }
+
+ uint16_t mtu;
+ if (!rte_vhost_get_mtu(vid, &mtu)) {
+ smap_add_format(args, "mtu", "%d", mtu);
+ }
+
+ int numa = rte_vhost_get_numa_node(vid);
+ if (numa >= 0) {
+ smap_add_format(args, "numa", "%d", numa);
+ }
+
+ uint16_t vring_num = rte_vhost_get_vring_num(vid);
+ if (vring_num) {
+ smap_add_format(args, "num_of_vrings", "%d", vring_num);
+ }
+
+ for (int i = 0; i < vring_num; i++) {
+ struct rte_vhost_vring vring;
+ char vhost_vring[16];
+
+ rte_vhost_get_vhost_vring(vid, i, &vring);
+ snprintf(vhost_vring, 16, "vring_%d_size", i);
+ smap_add_format(args, vhost_vring, "%d", vring.size);
+ }
+
+ ovs_mutex_unlock(&dev->mutex);
+ return 0;
+}
+
+static int
netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
@@ -3698,7 +3757,7 @@ static const struct netdev_class dpdk_vhost_class =
netdev_dpdk_vhost_get_stats,
NULL,
NULL,
- NULL,
+ netdev_dpdk_vhost_user_get_status,
netdev_dpdk_vhost_reconfigure,
netdev_dpdk_vhost_rxq_recv);
static const struct netdev_class dpdk_vhost_client_class =
@@ -3714,7 +3773,7 @@ static const struct netdev_class dpdk_vhost_client_class =
netdev_dpdk_vhost_get_stats,
NULL,
NULL,
- NULL,
+ netdev_dpdk_vhost_user_get_status,
netdev_dpdk_vhost_client_reconfigure,
netdev_dpdk_vhost_rxq_recv);
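
The new netdev_dpdk_vhost_user_get_status() callback fills the Interface
status column for vhost-user and vhost-user-client ports. A connected port
might report something along these lines (illustrative values only; the key
names come from the code above):

    ovs-vsctl get Interface vhost-user1 status
    {mode=client, status=connected, socket="/tmp/vhost-user1",
     features="0x0000000150208182", mtu="1500", numa="0",
     num_of_vrings="2", vring_0_size="256", vring_1_size="256"}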