commit 70bbaa46492d25d94154bebd4563d14d3e629068
tree ecf95cebf3b26961f50f67bc73a1fa47fca9811b /lib
parent 2927a4730b1f66078374f6951308196f43a91121
parent b2e8b12f8a821905c25295e04b843c0592a44339
author Ben Pfaff <blp@ovn.org> 2018-01-19 12:42:24 -0800
committer Ben Pfaff <blp@ovn.org> 2018-01-19 12:42:24 -0800

    Merge branch 'dpdk_merge' of https://github.com/istokes/ovs into HEAD
Diffstat (limited to 'lib')

 lib/automake.mk        |   2
 lib/dpif-netdev-perf.c |  60
 lib/dpif-netdev-perf.h | 232
 lib/dpif-netdev.c      | 654
 lib/netdev-dpdk.c      |  63
 5 files changed, 686 insertions(+), 325 deletions(-)
diff --git a/lib/automake.mk b/lib/automake.mk
index 4b38a1163..159319fa0 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -80,6 +80,8 @@ lib_libopenvswitch_la_SOURCES = \
 	lib/dpdk.h \
 	lib/dpif-netdev.c \
 	lib/dpif-netdev.h \
+	lib/dpif-netdev-perf.c \
+	lib/dpif-netdev-perf.h \
 	lib/dpif-provider.h \
 	lib/dpif.c \
 	lib/dpif.h \
diff --git a/lib/dpif-netdev-perf.c b/lib/dpif-netdev-perf.c
new file mode 100644
index 000000000..f06991ad8
--- /dev/null
+++ b/lib/dpif-netdev-perf.c
@@ -0,0 +1,60 @@
+/*
+ * Copyright (c) 2017 Ericsson AB.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/vlog.h"
+#include "dpif-netdev-perf.h"
+#include "timeval.h"
+
+VLOG_DEFINE_THIS_MODULE(pmd_perf);
+
+void
+pmd_perf_stats_init(struct pmd_perf_stats *s)
+{
+    memset(s, 0, sizeof(*s));
+}
+
+void
+pmd_perf_read_counters(struct pmd_perf_stats *s,
+                       uint64_t stats[PMD_N_STATS])
+{
+    uint64_t val;
+
+    /* This loop subtracts reference values (.zero[*]) from the counters.
+     * Since loads and stores are relaxed, it is possible for a .zero[*]
+     * value to be more recent than the current value we're reading from the
+     * counter. This is not a big problem, since these numbers are not
+     * supposed to be 100% accurate, but we should at least make sure that
+     * the result is not negative. */
+    for (int i = 0; i < PMD_N_STATS; i++) {
+        atomic_read_relaxed(&s->counters.n[i], &val);
+        if (val > s->counters.zero[i]) {
+            stats[i] = val - s->counters.zero[i];
+        } else {
+            stats[i] = 0;
+        }
+    }
+}
+
+void
+pmd_perf_stats_clear(struct pmd_perf_stats *s)
+{
+    for (int i = 0; i < PMD_N_STATS; i++) {
+        atomic_read_relaxed(&s->counters.n[i], &s->counters.zero[i]);
+    }
+}
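A note on the clearing scheme above: pmd_perf_stats_clear() never writes the n[] counters that the PMD hot path increments; it only snapshots them into zero[], and pmd_perf_read_counters() subtracts. The same pattern in a standalone sketch (illustrative only, not part of the commit; plain C11 atomics stand in for OVS's ovs-atomic wrappers):

    #include <stdatomic.h>
    #include <stdint.h>

    struct counter {
        atomic_uint_fast64_t n;   /* Incremented by the PMD thread only. */
        uint64_t zero;            /* Snapshot taken at the last clear. */
    };

    /* Hot path: relaxed load + store, never written by readers. */
    static inline void
    counter_add(struct counter *c, uint64_t delta)
    {
        uint64_t v = atomic_load_explicit(&c->n, memory_order_relaxed);
        atomic_store_explicit(&c->n, v + delta, memory_order_relaxed);
    }

    /* "Clear" from a control thread: only the zero offset moves. */
    static inline void
    counter_clear(struct counter *c)
    {
        c->zero = atomic_load_explicit(&c->n, memory_order_relaxed);
    }

    /* Read: clamp at zero in case the snapshot raced ahead of this load. */
    static inline uint64_t
    counter_read(struct counter *c)
    {
        uint64_t v = atomic_load_explicit(&c->n, memory_order_relaxed);
        return v > c->zero ? v - c->zero : 0;
    }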
diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
new file mode 100644
index 000000000..5993c25bc
--- /dev/null
+++ b/lib/dpif-netdev-perf.h
@@ -0,0 +1,232 @@
+/*
+ * Copyright (c) 2017 Ericsson AB.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef DPIF_NETDEV_PERF_H
+#define DPIF_NETDEV_PERF_H 1
+
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <string.h>
+#include <math.h>
+
+#ifdef DPDK_NETDEV
+#include <rte_config.h>
+#include <rte_cycles.h>
+#endif
+
+#include "openvswitch/vlog.h"
+#include "ovs-atomic.h"
+#include "timeval.h"
+#include "unixctl.h"
+#include "util.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* This module encapsulates data structures and functions to maintain PMD
+ * performance metrics such as packet counters and execution cycles. It
+ * provides a clean API for dpif-netdev to initialize, update, read and
+ * reset these metrics.
+ */
+
+/* Set of counter types maintained in pmd_perf_stats. */
+
+enum pmd_stat_type {
+    PMD_STAT_EXACT_HIT,     /* Packets that had an exact match (emc). */
+    PMD_STAT_MASKED_HIT,    /* Packets that matched in the flow table. */
+    PMD_STAT_MISS,          /* Packets that did not match and upcall was
+                             * ok. */
+    PMD_STAT_LOST,          /* Packets that did not match and upcall
+                             * failed. */
+                            /* The above statistics account for the total
+                             * number of packet passes through the datapath
+                             * pipeline and should not be overlapping with
+                             * each other. */
+    PMD_STAT_MASKED_LOOKUP, /* Number of subtable lookups for flow table
+                               hits. Each MASKED_HIT will have >= 1
+                               MASKED_LOOKUP(s). */
+    PMD_STAT_RECV,          /* Packets entering the datapath pipeline from
+                             * an interface. */
+    PMD_STAT_RECIRC,        /* Packets reentering the datapath pipeline due
+                             * to recirculation. */
+    PMD_STAT_SENT_PKTS,     /* Packets that have been sent. */
+    PMD_STAT_SENT_BATCHES,  /* Number of batches sent. */
+    PMD_CYCLES_ITER_IDLE,   /* Cycles spent in idle iterations. */
+    PMD_CYCLES_ITER_BUSY,   /* Cycles spent in busy iterations. */
+    PMD_N_STATS
+};
+
+/* Array of PMD counters indexed by enum pmd_stat_type.
+ * The n[] array contains the actual counter values since initialization
+ * of the PMD. Counters are atomically updated from the PMD but are
+ * read and cleared also from other processes. To clear the counters at
+ * PMD run-time, the current counter values are copied over to the zero[]
+ * array. To read counters we subtract the zero[] value from n[]. */
+
+struct pmd_counters {
+    atomic_uint64_t n[PMD_N_STATS];     /* Value since _init(). */
+    uint64_t zero[PMD_N_STATS];         /* Value at last _clear(). */
+};
+
+/* Container for all performance metrics of a PMD.
+ * Part of the struct dp_netdev_pmd_thread. */
+
+struct pmd_perf_stats {
+    /* Start of the current PMD iteration in TSC cycles. */
+    uint64_t start_it_tsc;
+    /* Latest TSC time stamp taken in PMD. */
+    uint64_t last_tsc;
+    /* If non-NULL, outermost cycle timer currently running in PMD. */
+    struct cycle_timer *cur_timer;
+    /* Set of PMD counters with their zero offsets. */
+    struct pmd_counters counters;
+};
+
+/* Support for accurate timing of PMD execution on TSC clock cycle level.
+ * These functions are intended to be invoked in the context of pmd
+ * threads. */
+
+/* Read the TSC cycle register and cache it. Any function not requiring clock
+ * cycle accuracy should read the cached value using cycles_counter_get() to
+ * avoid the overhead of reading the TSC register. */
+
+static inline uint64_t
+cycles_counter_update(struct pmd_perf_stats *s)
+{
+#ifdef DPDK_NETDEV
+    return s->last_tsc = rte_get_tsc_cycles();
+#else
+    return s->last_tsc = 0;
+#endif
+}
+
+static inline uint64_t
+cycles_counter_get(struct pmd_perf_stats *s)
+{
+    return s->last_tsc;
+}
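Because cycles_counter_update() is the only place the TSC register is actually read, code that merely needs a recent timestamp gets it for free from the cache. A small sketch of that usage (hypothetical helper, assuming the definitions above):

    /* Checks a deadline without touching the TSC register; relies on some
     * caller in the current iteration having run cycles_counter_update(). */
    static inline bool
    tsc_deadline_passed(struct pmd_perf_stats *s, uint64_t deadline)
    {
        return cycles_counter_get(s) >= deadline;
    }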
+/* A nestable timer for measuring execution time in TSC cycles.
+ *
+ * Usage:
+ *   struct cycle_timer timer;
+ *
+ *   cycle_timer_start(pmd, &timer);
+ *   <Timed execution>
+ *   uint64_t cycles = cycle_timer_stop(pmd, &timer);
+ *
+ * The caller must guarantee that a call to cycle_timer_start() is always
+ * paired with a call to cycle_timer_stop().
+ *
+ * It is possible to have nested cycle timers within the timed code. The
+ * execution time measured by the nested timers is excluded from the time
+ * measured by the enclosing timer.
+ */
+
+struct cycle_timer {
+    uint64_t start;
+    uint64_t suspended;
+    struct cycle_timer *interrupted;
+};
+
+static inline void
+cycle_timer_start(struct pmd_perf_stats *s,
+                  struct cycle_timer *timer)
+{
+    struct cycle_timer *cur_timer = s->cur_timer;
+    uint64_t now = cycles_counter_update(s);
+
+    if (cur_timer) {
+        cur_timer->suspended = now;
+    }
+    timer->interrupted = cur_timer;
+    timer->start = now;
+    timer->suspended = 0;
+    s->cur_timer = timer;
+}
+
+static inline uint64_t
+cycle_timer_stop(struct pmd_perf_stats *s,
+                 struct cycle_timer *timer)
+{
+    /* Assert that this is the current cycle timer. */
+    ovs_assert(s->cur_timer == timer);
+    uint64_t now = cycles_counter_update(s);
+    struct cycle_timer *intr_timer = timer->interrupted;
+
+    if (intr_timer) {
+        /* Adjust the start offset by the suspended cycles. */
+        intr_timer->start += now - intr_timer->suspended;
+    }
+    /* Restore suspended timer, if any. */
+    s->cur_timer = intr_timer;
+    return now - timer->start;
+}
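To make the nesting semantics concrete, here is a minimal sketch (not part of the commit; process_burst() and the commented work steps are hypothetical) in which an inner timer carves its cycles out of the outer measurement:

    /* Assumes dpif-netdev-perf.h. The outer result excludes the cycles
     * spent between the inner start/stop pair. */
    static uint64_t
    process_burst(struct pmd_perf_stats *s)
    {
        struct cycle_timer proc_timer;
        struct cycle_timer send_timer;
        uint64_t proc_cycles;

        cycle_timer_start(s, &proc_timer);
        /* ... classification work, attributed to proc_timer ... */

        cycle_timer_start(s, &send_timer);   /* Suspends proc_timer. */
        /* ... send work, attributed to send_timer only ... */
        cycle_timer_stop(s, &send_timer);    /* Resumes proc_timer. */

        proc_cycles = cycle_timer_stop(s, &proc_timer);
        return proc_cycles;
    }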
+
+void pmd_perf_stats_init(struct pmd_perf_stats *s);
+void pmd_perf_stats_clear(struct pmd_perf_stats *s);
+void pmd_perf_read_counters(struct pmd_perf_stats *s,
+                            uint64_t stats[PMD_N_STATS]);
+
+/* PMD performance counters are updated lock-less. For real PMDs
+ * they are only updated from the PMD thread itself. In the case of the
+ * NON-PMD they might be updated from multiple threads, but we can live
+ * with losing a rare update as 100% accuracy is not required.
+ * However, as counters are read for display from outside the PMD thread
+ * with e.g. pmd-stats-show, we make sure that the 64-bit read and store
+ * operations are atomic also on 32-bit systems so that readers cannot
+ * read garbage. On 64-bit systems this incurs no overhead. */
+
+static inline void
+pmd_perf_update_counter(struct pmd_perf_stats *s,
+                        enum pmd_stat_type counter, int delta)
+{
+    uint64_t tmp;
+    atomic_read_relaxed(&s->counters.n[counter], &tmp);
+    tmp += delta;
+    atomic_store_relaxed(&s->counters.n[counter], tmp);
+}
+
+static inline void
+pmd_perf_start_iteration(struct pmd_perf_stats *s)
+{
+    if (OVS_LIKELY(s->last_tsc)) {
+        /* We assume here that last_tsc was updated immediately prior, at
+         * the end of the previous iteration, or just before the first
+         * iteration. */
+        s->start_it_tsc = s->last_tsc;
+    } else {
+        /* In case last_tsc has never been set before. */
+        s->start_it_tsc = cycles_counter_update(s);
+    }
+}
+
+static inline void
+pmd_perf_end_iteration(struct pmd_perf_stats *s, int rx_packets)
+{
+    uint64_t cycles = cycles_counter_update(s) - s->start_it_tsc;
+
+    if (rx_packets > 0) {
+        pmd_perf_update_counter(s, PMD_CYCLES_ITER_BUSY, cycles);
+    } else {
+        pmd_perf_update_counter(s, PMD_CYCLES_ITER_IDLE, cycles);
+    }
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* DPIF_NETDEV_PERF_H */
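Putting the pieces together, a PMD-style main loop brackets every pass with the two iteration hooks, and busy/idle attribution falls out of the packet count. A condensed, self-contained sketch (poll_all_rxqs() is a stand-in for the real per-rxq polling in pmd_thread_main() further down):

    static int
    poll_all_rxqs(void)
    {
        return 0;   /* Stand-in: would return packets received this pass. */
    }

    static void
    pmd_main_loop(struct pmd_perf_stats *s)
    {
        cycles_counter_update(s);   /* Seed last_tsc before the first pass. */
        for (;;) {
            pmd_perf_start_iteration(s);
            int rx_packets = poll_all_rxqs();
            /* The whole pass is booked as PMD_CYCLES_ITER_BUSY or
             * PMD_CYCLES_ITER_IDLE depending on rx_packets. */
            pmd_perf_end_iteration(s, rx_packets);
        }
    }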
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index c7d157ab6..f94b189e0 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -32,10 +32,6 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
-#ifdef DPDK_NETDEV
-#include <rte_cycles.h>
-#endif
-
 #include "bitmap.h"
 #include "cmap.h"
 #include "conntrack.h"
@@ -44,6 +40,7 @@
 #include "csum.h"
 #include "dp-packet.h"
 #include "dpif.h"
+#include "dpif-netdev-perf.h"
 #include "dpif-provider.h"
 #include "dummy.h"
 #include "fat-rwlock.h"
@@ -86,6 +83,9 @@ VLOG_DEFINE_THIS_MODULE(dpif_netdev);
 #define MAX_RECIRC_DEPTH 6
 DEFINE_STATIC_PER_THREAD_DATA(uint32_t, recirc_depth, 0)
 
+/* Use instant packet send by default. */
+#define DEFAULT_TX_FLUSH_INTERVAL 0
+
 /* Configuration parameters. */
 enum { MAX_FLOWS = 65536 };     /* Maximum number of flows in flow table. */
 enum { MAX_METERS = 65536 };    /* Maximum number of meters. */
@@ -179,12 +179,13 @@ struct emc_cache {
 
 /* Simple non-wildcarding single-priority classifier. */
 
-/* Time in ms between successive optimizations of the dpcls subtable vector */
-#define DPCLS_OPTIMIZATION_INTERVAL 1000
+/* Time in microseconds between successive optimizations of the dpcls
+ * subtable vector */
+#define DPCLS_OPTIMIZATION_INTERVAL 1000000LL
 
-/* Time in ms of the interval in which rxq processing cycles used in
- * rxq to pmd assignments is measured and stored. */
-#define PMD_RXQ_INTERVAL_LEN 10000
+/* Time in microseconds of the interval in which rxq processing cycles used
+ * in rxq to pmd assignments is measured and stored. */
+#define PMD_RXQ_INTERVAL_LEN 10000000LL
 
 /* Number of intervals for which cycles are stored
  * and used during rxq to pmd assignment. */
@@ -271,6 +272,9 @@ struct dp_netdev {
     struct hmap ports;
     struct seq *port_seq;       /* Incremented whenever a port changes. */
 
+    /* The time that a packet can wait in output batch for sending. */
+    atomic_uint32_t tx_flush_interval;
+
     /* Meters. */
     struct ovs_mutex meter_locks[N_METER_LOCKS];
     struct dp_meter *meters[MAX_METERS]; /* Meter bands. */
@@ -331,25 +335,6 @@ static struct dp_netdev_port *dp_netdev_lookup_port(const struct dp_netdev *dp,
                                                     odp_port_t)
     OVS_REQUIRES(dp->port_mutex);
 
-enum dp_stat_type {
-    DP_STAT_EXACT_HIT,          /* Packets that had an exact match (emc). */
-    DP_STAT_MASKED_HIT,         /* Packets that matched in the flow table. */
-    DP_STAT_MISS,               /* Packets that did not match. */
-    DP_STAT_LOST,               /* Packets not passed up to the client. */
-    DP_STAT_LOOKUP_HIT,         /* Number of subtable lookups for flow table
-                                   hits */
-    DP_STAT_SENT_PKTS,          /* Packets that have been sent. */
-    DP_STAT_SENT_BATCHES,       /* Number of batches sent. */
-    DP_N_STATS
-};
-
-enum pmd_cycles_counter_type {
-    PMD_CYCLES_IDLE,            /* Cycles spent idle or unsuccessful polling */
-    PMD_CYCLES_PROCESSING,      /* Cycles spent successfully polling and
-                                 * processing polled packets */
-    PMD_N_CYCLES
-};
-
 enum rxq_cycles_counter_type {
     RXQ_CYCLES_PROC_CURR,       /* Cycles spent successfully polling and
                                    processing packets during the current
@@ -359,7 +344,7 @@ enum rxq_cycles_counter_type {
     RXQ_N_CYCLES
 };
 
-#define XPS_TIMEOUT_MS 500LL
+#define XPS_TIMEOUT 500000LL    /* In microseconds. */
 
 /* Contained by struct dp_netdev_port's 'rxqs' member. */
 struct dp_netdev_rxq {
@@ -499,21 +484,6 @@ struct dp_netdev_actions *dp_netdev_flow_get_actions(
     const struct dp_netdev_flow *);
 static void dp_netdev_actions_free(struct dp_netdev_actions *);
 
-/* Contained by struct dp_netdev_pmd_thread's 'stats' member.  */
-struct dp_netdev_pmd_stats {
-    /* Indexed by DP_STAT_*. */
-    atomic_ullong n[DP_N_STATS];
-};
-
-/* Contained by struct dp_netdev_pmd_thread's 'cycle' member.  */
-struct dp_netdev_pmd_cycles {
-    /* Indexed by PMD_CYCLES_*. */
-    atomic_ullong n[PMD_N_CYCLES];
-};
-
-static void dp_netdev_count_packet(struct dp_netdev_pmd_thread *,
-                                   enum dp_stat_type type, int cnt);
-
 struct polled_queue {
     struct dp_netdev_rxq *rxq;
     odp_port_t port_no;
@@ -532,7 +502,9 @@ struct tx_port {
     int qid;
     long long last_used;
     struct hmap_node node;
+    long long flush_time;
     struct dp_packet_batch output_pkts;
+    struct dp_netdev_rxq *output_pkts_rxqs[NETDEV_MAX_BURST];
 };
 
 /* A set of properties for the current processing loop that is not directly
@@ -542,8 +514,8 @@ struct tx_port {
 struct dp_netdev_pmd_thread_ctx {
     /* Latest measured time. See 'pmd_thread_ctx_time_update()'. */
     long long now;
-    /* Used to count cycles. See 'cycles_count_end()' */
-    unsigned long long last_cycles;
+    /* RX queue from which last packet was received. */
+    struct dp_netdev_rxq *last_rxq;
 };
 
 /* PMD: Poll modes drivers. PMD accesses devices via polling to eliminate
@@ -595,11 +567,10 @@ struct dp_netdev_pmd_thread {
                                       are stored for each polled rxq. */
     long long int rxq_next_cycle_store;
 
-    /* Statistics. */
-    struct dp_netdev_pmd_stats stats;
-
-    /* Cycles counters */
-    struct dp_netdev_pmd_cycles cycles;
+    /* Last interval timestamp. */
+    uint64_t intrvl_tsc_prev;
+    /* Last interval cycles. */
+    atomic_ullong intrvl_cycles;
 
     /* Current context of the PMD thread. */
     struct dp_netdev_pmd_thread_ctx ctx;
@@ -618,6 +589,9 @@ struct dp_netdev_pmd_thread {
      * than 'cmap_count(dp->poll_threads)'. */
     uint32_t static_tx_qid;
 
+    /* Number of filled output batches. */
+    int n_output_batches;
+
     struct ovs_mutex port_mutex;    /* Mutex for 'poll_list' and 'tx_ports'. */
     /* List of rx queues to poll. */
     struct hmap poll_list OVS_GUARDED;
@@ -638,12 +612,8 @@ struct dp_netdev_pmd_thread {
     struct hmap tnl_port_cache;
     struct hmap send_port_cache;
 
-    /* Only a pmd thread can write on its own 'cycles' and 'stats'.
-     * The main thread keeps 'stats_zero' and 'cycles_zero' as base
-     * values and subtracts them from 'stats' and 'cycles' before
-     * reporting to the user */
-    unsigned long long stats_zero[DP_N_STATS];
-    uint64_t cycles_zero[PMD_N_CYCLES];
+    /* Keep track of detailed PMD performance statistics. */
+    struct pmd_perf_stats perf_stats;
 
     /* Set to true if the pmd thread needs to be reloaded. */
     bool need_reload;
@@ -711,8 +681,9 @@ static void dp_netdev_add_rxq_to_pmd(struct dp_netdev_pmd_thread *pmd,
 static void dp_netdev_del_rxq_from_pmd(struct dp_netdev_pmd_thread *pmd,
                                        struct rxq_poll *poll)
     OVS_REQUIRES(pmd->port_mutex);
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd);
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+                                   bool force);
 static void reconfigure_datapath(struct dp_netdev *dp)
     OVS_REQUIRES(dp->port_mutex);
@@ -803,7 +774,7 @@ emc_cache_slow_sweep(struct emc_cache *flow_cache)
 static inline void
 pmd_thread_ctx_time_update(struct dp_netdev_pmd_thread *pmd)
 {
-    pmd->ctx.now = time_msec();
+    pmd->ctx.now = time_usec();
 }
 
 /* Returns true if 'dpif' is a netdev or dummy dpif, false otherwise. */
@@ -833,47 +804,10 @@ enum pmd_info_type {
 };
 
 static void
-pmd_info_show_stats(struct ds *reply,
-                    struct dp_netdev_pmd_thread *pmd,
-                    unsigned long long stats[DP_N_STATS],
-                    uint64_t cycles[PMD_N_CYCLES])
+format_pmd_thread(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 {
-    unsigned long long total_packets;
-    uint64_t total_cycles = 0;
-    double lookups_per_hit = 0, packets_per_batch = 0;
-    int i;
-
-    /* These loops subtracts reference values ('*_zero') from the counters.
-     * Since loads and stores are relaxed, it might be possible for a '*_zero'
-     * value to be more recent than the current value we're reading from the
-     * counter. This is not a big problem, since these numbers are not
-     * supposed to be too accurate, but we should at least make sure that
-     * the result is not negative. */
-    for (i = 0; i < DP_N_STATS; i++) {
-        if (stats[i] > pmd->stats_zero[i]) {
-            stats[i] -= pmd->stats_zero[i];
-        } else {
-            stats[i] = 0;
-        }
-    }
-
-    /* Sum of all the matched and not matched packets gives the total.  */
-    total_packets = stats[DP_STAT_EXACT_HIT] + stats[DP_STAT_MASKED_HIT]
-                    + stats[DP_STAT_MISS];
-
-    for (i = 0; i < PMD_N_CYCLES; i++) {
-        if (cycles[i] > pmd->cycles_zero[i]) {
-            cycles[i] -= pmd->cycles_zero[i];
-        } else {
-            cycles[i] = 0;
-        }
-
-        total_cycles += cycles[i];
-    }
-
     ds_put_cstr(reply, (pmd->core_id == NON_PMD_CORE_ID)
                 ? "main thread" : "pmd thread");
"main thread" : "pmd thread"); - if (pmd->numa_id != OVS_NUMA_UNSPEC) { ds_put_format(reply, " numa_id %d", pmd->numa_id); } @@ -881,23 +815,52 @@ pmd_info_show_stats(struct ds *reply, ds_put_format(reply, " core_id %u", pmd->core_id); } ds_put_cstr(reply, ":\n"); +} + +static void +pmd_info_show_stats(struct ds *reply, + struct dp_netdev_pmd_thread *pmd) +{ + uint64_t stats[PMD_N_STATS]; + uint64_t total_cycles, total_packets; + double passes_per_pkt = 0; + double lookups_per_hit = 0; + double packets_per_batch = 0; + + pmd_perf_read_counters(&pmd->perf_stats, stats); + total_cycles = stats[PMD_CYCLES_ITER_IDLE] + + stats[PMD_CYCLES_ITER_BUSY]; + total_packets = stats[PMD_STAT_RECV]; - if (stats[DP_STAT_MASKED_HIT] > 0) { - lookups_per_hit = stats[DP_STAT_LOOKUP_HIT] - / (double) stats[DP_STAT_MASKED_HIT]; + format_pmd_thread(reply, pmd); + + if (total_packets > 0) { + passes_per_pkt = (total_packets + stats[PMD_STAT_RECIRC]) + / (double) total_packets; + } + if (stats[PMD_STAT_MASKED_HIT] > 0) { + lookups_per_hit = stats[PMD_STAT_MASKED_LOOKUP] + / (double) stats[PMD_STAT_MASKED_HIT]; } - if (stats[DP_STAT_SENT_BATCHES] > 0) { - packets_per_batch = stats[DP_STAT_SENT_PKTS] - / (double) stats[DP_STAT_SENT_BATCHES]; + if (stats[PMD_STAT_SENT_BATCHES] > 0) { + packets_per_batch = stats[PMD_STAT_SENT_PKTS] + / (double) stats[PMD_STAT_SENT_BATCHES]; } ds_put_format(reply, - "\temc hits:%llu\n\tmegaflow hits:%llu\n" - "\tavg. subtable lookups per hit:%.2f\n" - "\tmiss:%llu\n\tlost:%llu\n" - "\tavg. packets per output batch: %.2f\n", - stats[DP_STAT_EXACT_HIT], stats[DP_STAT_MASKED_HIT], - lookups_per_hit, stats[DP_STAT_MISS], stats[DP_STAT_LOST], + "\tpackets received: %"PRIu64"\n" + "\tpacket recirculations: %"PRIu64"\n" + "\tavg. datapath passes per packet: %.02f\n" + "\temc hits: %"PRIu64"\n" + "\tmegaflow hits: %"PRIu64"\n" + "\tavg. subtable lookups per megaflow hit: %.02f\n" + "\tmiss with success upcall: %"PRIu64"\n" + "\tmiss with failed upcall: %"PRIu64"\n" + "\tavg. 
@@ -995,9 +937,9 @@ static void
 pmd_info_show_rxq(struct ds *reply, struct dp_netdev_pmd_thread *pmd)
 {
     if (pmd->core_id != NON_PMD_CORE_ID) {
-        const char *prev_name = NULL;
         struct rxq_poll *list;
-        size_t i, n;
+        size_t n_rxq;
+        uint64_t total_cycles = 0;
 
         ds_put_format(reply,
                       "pmd thread numa_id %d core_id %u:\n\tisolated : %s\n",
                       pmd->numa_id, pmd->core_id, (pmd->isolated)
                                                   ? "true" : "false");
 
         ovs_mutex_lock(&pmd->port_mutex);
-        sorted_poll_list(pmd, &list, &n);
-        for (i = 0; i < n; i++) {
-            const char *name = netdev_rxq_get_name(list[i].rxq->rx);
+        sorted_poll_list(pmd, &list, &n_rxq);
 
-            if (!prev_name || strcmp(name, prev_name)) {
-                if (prev_name) {
-                    ds_put_cstr(reply, "\n");
-                }
-                ds_put_format(reply, "\tport: %s\tqueue-id:", name);
+        /* Get the total pmd cycles for an interval. */
+        atomic_read_relaxed(&pmd->intrvl_cycles, &total_cycles);
+        /* Estimate the cycles to cover all intervals. */
+        total_cycles *= PMD_RXQ_INTERVAL_MAX;
+
+        for (int i = 0; i < n_rxq; i++) {
+            struct dp_netdev_rxq *rxq = list[i].rxq;
+            const char *name = netdev_rxq_get_name(rxq->rx);
+            uint64_t proc_cycles = 0;
+
+            for (int j = 0; j < PMD_RXQ_INTERVAL_MAX; j++) {
+                proc_cycles += dp_netdev_rxq_get_intrvl_cycles(rxq, j);
             }
-            ds_put_format(reply, " %d",
+            ds_put_format(reply, "\tport: %-16s\tqueue-id: %2d", name,
                           netdev_rxq_get_queue_id(list[i].rxq->rx));
-            prev_name = name;
+            ds_put_format(reply, "\tpmd usage: ");
+            if (total_cycles) {
+                ds_put_format(reply, "%2"PRIu64"",
+                              proc_cycles * 100 / total_cycles);
+                ds_put_cstr(reply, " %");
+            } else {
+                ds_put_format(reply, "%s", "NOT AVAIL");
+            }
+            ds_put_cstr(reply, "\n");
         }
         ovs_mutex_unlock(&pmd->port_mutex);
-        ds_put_cstr(reply, "\n");
         free(list);
     }
 }
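The reworked pmd-rxq-show correspondingly prints one line per queue with an estimated share of the PMD's cycles over the stored intervals (again illustrative):

    pmd thread numa_id 0 core_id 1:
            isolated : false
            port: dpdk0             queue-id:  0    pmd usage: 23 %
            port: vhostuser0        queue-id:  0    pmd usage:  5 %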
@@ -1106,23 +1060,37 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
     struct ds reply = DS_EMPTY_INITIALIZER;
     struct dp_netdev_pmd_thread **pmd_list;
     struct dp_netdev *dp = NULL;
-    size_t n;
     enum pmd_info_type type = *(enum pmd_info_type *) aux;
+    unsigned int core_id;
+    bool filter_on_pmd = false;
+    size_t n;
 
     ovs_mutex_lock(&dp_netdev_mutex);
 
-    if (argc == 2) {
-        dp = shash_find_data(&dp_netdevs, argv[1]);
-    } else if (shash_count(&dp_netdevs) == 1) {
-        /* There's only one datapath */
-        dp = shash_first(&dp_netdevs)->data;
+    while (argc > 1) {
+        if (!strcmp(argv[1], "-pmd") && argc >= 3) {
+            if (str_to_uint(argv[2], 10, &core_id)) {
+                filter_on_pmd = true;
+            }
+            argc -= 2;
+            argv += 2;
+        } else {
+            dp = shash_find_data(&dp_netdevs, argv[1]);
+            argc -= 1;
+            argv += 1;
+        }
     }
 
     if (!dp) {
-        ovs_mutex_unlock(&dp_netdev_mutex);
-        unixctl_command_reply_error(conn,
-                                    "please specify an existing datapath");
-        return;
+        if (shash_count(&dp_netdevs) == 1) {
+            /* There's only one datapath */
+            dp = shash_first(&dp_netdevs)->data;
+        } else {
+            ovs_mutex_unlock(&dp_netdev_mutex);
+            unixctl_command_reply_error(conn,
+                                        "please specify an existing datapath");
+            return;
+        }
     }
 
     sorted_poll_thread_list(dp, &pmd_list, &n);
@@ -1131,26 +1099,15 @@ dpif_netdev_pmd_info(struct unixctl_conn *conn, int argc, const char *argv[],
         if (!pmd) {
             break;
         }
-
+        if (filter_on_pmd && pmd->core_id != core_id) {
+            continue;
+        }
         if (type == PMD_INFO_SHOW_RXQ) {
             pmd_info_show_rxq(&reply, pmd);
-        } else {
-            unsigned long long stats[DP_N_STATS];
-            uint64_t cycles[PMD_N_CYCLES];
-
-            /* Read current stats and cycle counters */
-            for (size_t j = 0; j < ARRAY_SIZE(stats); j++) {
-                atomic_read_relaxed(&pmd->stats.n[j], &stats[j]);
-            }
-            for (size_t j = 0; j < ARRAY_SIZE(cycles); j++) {
-                atomic_read_relaxed(&pmd->cycles.n[j], &cycles[j]);
-            }
-
-            if (type == PMD_INFO_CLEAR_STATS) {
-                pmd_info_clear_stats(&reply, pmd, stats, cycles);
-            } else if (type == PMD_INFO_SHOW_STATS) {
-                pmd_info_show_stats(&reply, pmd, stats, cycles);
-            }
+        } else if (type == PMD_INFO_CLEAR_STATS) {
+            pmd_perf_stats_clear(&pmd->perf_stats);
+        } else if (type == PMD_INFO_SHOW_STATS) {
+            pmd_info_show_stats(&reply, pmd);
         }
     }
     free(pmd_list);
@@ -1168,14 +1125,14 @@ dpif_netdev_init(void)
                               clear_aux = PMD_INFO_CLEAR_STATS,
                               poll_aux = PMD_INFO_SHOW_RXQ;
 
-    unixctl_command_register("dpif-netdev/pmd-stats-show", "[dp]",
-                             0, 1, dpif_netdev_pmd_info,
+    unixctl_command_register("dpif-netdev/pmd-stats-show", "[-pmd core] [dp]",
+                             0, 3, dpif_netdev_pmd_info,
                              (void *)&show_aux);
-    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[dp]",
-                             0, 1, dpif_netdev_pmd_info,
+    unixctl_command_register("dpif-netdev/pmd-stats-clear", "[-pmd core] [dp]",
+                             0, 3, dpif_netdev_pmd_info,
                              (void *)&clear_aux);
-    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[dp]",
-                             0, 1, dpif_netdev_pmd_info,
+    unixctl_command_register("dpif-netdev/pmd-rxq-show", "[-pmd core] [dp]",
+                             0, 3, dpif_netdev_pmd_info,
                              (void *)&poll_aux);
     unixctl_command_register("dpif-netdev/pmd-rxq-rebalance", "[dp]",
                              0, 1, dpif_netdev_pmd_rebalance,
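With the registrations above, all three commands accept the new optional filter, e.g. to inspect or reset only the PMD on core 3 of the default datapath:

    ovs-appctl dpif-netdev/pmd-stats-show -pmd 3
    ovs-appctl dpif-netdev/pmd-rxq-show -pmd 3
    ovs-appctl dpif-netdev/pmd-stats-clear -pmd 3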
@@ -1312,6 +1269,7 @@ create_dp_netdev(const char *name, const struct dpif_class *class,
     conntrack_init(&dp->conntrack);
 
     atomic_init(&dp->emc_insert_min, DEFAULT_EM_FLOW_INSERT_MIN);
+    atomic_init(&dp->tx_flush_interval, DEFAULT_TX_FLUSH_INTERVAL);
 
     cmap_init(&dp->poll_threads);
 
@@ -1511,20 +1469,16 @@ dpif_netdev_get_stats(const struct dpif *dpif, struct dpif_dp_stats *stats)
 {
     struct dp_netdev *dp = get_dp_netdev(dpif);
     struct dp_netdev_pmd_thread *pmd;
+    uint64_t pmd_stats[PMD_N_STATS];
 
     stats->n_flows = stats->n_hit = stats->n_missed = stats->n_lost = 0;
     CMAP_FOR_EACH (pmd, node, &dp->poll_threads) {
-        unsigned long long n;
         stats->n_flows += cmap_count(&pmd->flow_table);
-
-        atomic_read_relaxed(&pmd->stats.n[DP_STAT_MASKED_HIT], &n);
-        stats->n_hit += n;
-        atomic_read_relaxed(&pmd->stats.n[DP_STAT_EXACT_HIT], &n);
-        stats->n_hit += n;
-        atomic_read_relaxed(&pmd->stats.n[DP_STAT_MISS], &n);
-        stats->n_missed += n;
-        atomic_read_relaxed(&pmd->stats.n[DP_STAT_LOST], &n);
-        stats->n_lost += n;
+        pmd_perf_read_counters(&pmd->perf_stats, pmd_stats);
+        stats->n_hit += pmd_stats[PMD_STAT_EXACT_HIT];
+        stats->n_hit += pmd_stats[PMD_STAT_MASKED_HIT];
+        stats->n_missed += pmd_stats[PMD_STAT_MISS];
+        stats->n_lost += pmd_stats[PMD_STAT_LOST];
     }
     stats->n_masks = UINT32_MAX;
     stats->n_mask_hit = UINT64_MAX;
@@ -2982,7 +2936,7 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
     dp_packet_batch_init_packet(&pp, execute->packet);
     dp_netdev_execute_actions(pmd, &pp, false, execute->flow,
                               execute->actions, execute->actions_len);
-    dp_netdev_pmd_flush_output_packets(pmd);
+    dp_netdev_pmd_flush_output_packets(pmd, true);
 
     if (pmd->core_id == NON_PMD_CORE_ID) {
         ovs_mutex_unlock(&dp->non_pmd_mutex);
@@ -3031,6 +2985,16 @@ dpif_netdev_set_config(struct dpif *dpif, const struct smap *other_config)
         smap_get_ullong(other_config, "emc-insert-inv-prob",
                         DEFAULT_EM_FLOW_INSERT_INV_PROB);
     uint32_t insert_min, cur_min;
+    uint32_t tx_flush_interval, cur_tx_flush_interval;
+
+    tx_flush_interval = smap_get_int(other_config, "tx-flush-interval",
+                                     DEFAULT_TX_FLUSH_INTERVAL);
+    atomic_read_relaxed(&dp->tx_flush_interval, &cur_tx_flush_interval);
+    if (tx_flush_interval != cur_tx_flush_interval) {
+        atomic_store_relaxed(&dp->tx_flush_interval, tx_flush_interval);
+        VLOG_INFO("Flushing interval for tx queues set to %"PRIu32" us",
+                  tx_flush_interval);
+    }
 
     if (!nullable_string_is_equal(dp->pmd_cmask, cmask)) {
         free(dp->pmd_cmask);
@@ -3184,64 +3148,20 @@ dp_netdev_actions_free(struct dp_netdev_actions *actions)
     free(actions);
 }
 
-static inline unsigned long long
-cycles_counter(void)
-{
-#ifdef DPDK_NETDEV
-    return rte_get_tsc_cycles();
-#else
-    return 0;
-#endif
-}
-
-/* Fake mutex to make sure that the calls to cycles_count_* are balanced */
-extern struct ovs_mutex cycles_counter_fake_mutex;
-
-/* Start counting cycles.  Must be followed by 'cycles_count_end()' */
-static inline void
-cycles_count_start(struct dp_netdev_pmd_thread *pmd)
-    OVS_ACQUIRES(&cycles_counter_fake_mutex)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    pmd->ctx.last_cycles = cycles_counter();
-}
-
-/* Stop counting cycles and add them to the counter 'type' */
-static inline void
-cycles_count_end(struct dp_netdev_pmd_thread *pmd,
-                 enum pmd_cycles_counter_type type)
-    OVS_RELEASES(&cycles_counter_fake_mutex)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
-{
-    unsigned long long interval = cycles_counter() - pmd->ctx.last_cycles;
-
-    non_atomic_ullong_add(&pmd->cycles.n[type], interval);
-}
-
-/* Calculate the intermediate cycle result and add to the counter 'type' */
-static inline void
-cycles_count_intermediate(struct dp_netdev_pmd_thread *pmd,
-                          struct dp_netdev_rxq *rxq,
-                          enum pmd_cycles_counter_type type)
-    OVS_NO_THREAD_SAFETY_ANALYSIS
+static void
+dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
+                         enum rxq_cycles_counter_type type,
+                         unsigned long long cycles)
 {
-    unsigned long long new_cycles = cycles_counter();
-    unsigned long long interval = new_cycles - pmd->ctx.last_cycles;
-    pmd->ctx.last_cycles = new_cycles;
-
-    non_atomic_ullong_add(&pmd->cycles.n[type], interval);
-    if (rxq && (type == PMD_CYCLES_PROCESSING)) {
-        /* Add to the amount of current processing cycles. */
-        non_atomic_ullong_add(&rxq->cycles[RXQ_CYCLES_PROC_CURR], interval);
-    }
+    atomic_store_relaxed(&rx->cycles[type], cycles);
 }
 
 static void
-dp_netdev_rxq_set_cycles(struct dp_netdev_rxq *rx,
+dp_netdev_rxq_add_cycles(struct dp_netdev_rxq *rx,
                          enum rxq_cycles_counter_type type,
                          unsigned long long cycles)
 {
-    atomic_store_relaxed(&rx->cycles[type], cycles);
+    non_atomic_ullong_add(&rx->cycles[type], cycles);
 }
 
 static uint64_t
@@ -3269,13 +3189,19 @@ dp_netdev_rxq_get_intrvl_cycles(struct dp_netdev_rxq *rx, unsigned idx)
     return processing_cycles;
 }
 
-static void
+static int
 dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
                                    struct tx_port *p)
 {
+    int i;
     int tx_qid;
     int output_cnt;
     bool dynamic_txqs;
+    struct cycle_timer timer;
+    uint64_t cycles;
+    uint32_t tx_flush_interval;
+
+    cycle_timer_start(&pmd->perf_stats, &timer);
 
     dynamic_txqs = p->port->dynamic_txqs;
     if (dynamic_txqs) {
@@ -3285,52 +3211,99 @@ dp_netdev_pmd_flush_output_on_port(struct dp_netdev_pmd_thread *pmd,
     }
 
     output_cnt = dp_packet_batch_size(&p->output_pkts);
+    ovs_assert(output_cnt > 0);
 
     netdev_send(p->port->netdev, tx_qid, &p->output_pkts, dynamic_txqs);
     dp_packet_batch_init(&p->output_pkts);
 
-    dp_netdev_count_packet(pmd, DP_STAT_SENT_PKTS, output_cnt);
-    dp_netdev_count_packet(pmd, DP_STAT_SENT_BATCHES, 1);
+    /* Update time of the next flush. */
+    atomic_read_relaxed(&pmd->dp->tx_flush_interval, &tx_flush_interval);
+    p->flush_time = pmd->ctx.now + tx_flush_interval;
+
+    ovs_assert(pmd->n_output_batches > 0);
+    pmd->n_output_batches--;
+
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_PKTS, output_cnt);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_SENT_BATCHES, 1);
+
+    /* Distribute send cycles evenly among transmitted packets and assign to
+     * their respective rx queues. */
+    cycles = cycle_timer_stop(&pmd->perf_stats, &timer) / output_cnt;
+    for (i = 0; i < output_cnt; i++) {
+        if (p->output_pkts_rxqs[i]) {
+            dp_netdev_rxq_add_cycles(p->output_pkts_rxqs[i],
+                                     RXQ_CYCLES_PROC_CURR, cycles);
+        }
+    }
+
+    return output_cnt;
 }
 
-static void
-dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd)
+static int
+dp_netdev_pmd_flush_output_packets(struct dp_netdev_pmd_thread *pmd,
+                                   bool force)
 {
     struct tx_port *p;
+    int output_cnt = 0;
+
+    if (!pmd->n_output_batches) {
+        return 0;
+    }
 
     HMAP_FOR_EACH (p, node, &pmd->send_port_cache) {
-        if (!dp_packet_batch_is_empty(&p->output_pkts)) {
-            dp_netdev_pmd_flush_output_on_port(pmd, p);
+        if (!dp_packet_batch_is_empty(&p->output_pkts)
+            && (force || pmd->ctx.now >= p->flush_time)) {
+            output_cnt += dp_netdev_pmd_flush_output_on_port(pmd, p);
         }
     }
+    return output_cnt;
 }
 
 static int
 dp_netdev_process_rxq_port(struct dp_netdev_pmd_thread *pmd,
-                           struct netdev_rxq *rx,
+                           struct dp_netdev_rxq *rxq,
                            odp_port_t port_no)
 {
     struct dp_packet_batch batch;
+    struct cycle_timer timer;
     int error;
-    int batch_cnt = 0;
+    int batch_cnt = 0, output_cnt = 0;
+    uint64_t cycles;
+
+    /* Measure duration for polling and processing rx burst. */
+    cycle_timer_start(&pmd->perf_stats, &timer);
+
+    pmd->ctx.last_rxq = rxq;
 
     dp_packet_batch_init(&batch);
-    error = netdev_rxq_recv(rx, &batch);
+
+    error = netdev_rxq_recv(rxq->rx, &batch);
     if (!error) {
+        /* At least one packet received. */
         *recirc_depth_get() = 0;
         pmd_thread_ctx_time_update(pmd);
 
         batch_cnt = batch.count;
         dp_netdev_input(pmd, &batch, port_no);
-        dp_netdev_pmd_flush_output_packets(pmd);
-    } else if (error != EAGAIN && error != EOPNOTSUPP) {
-        static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
-
-        VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
-                    netdev_rxq_get_name(rx), ovs_strerror(error));
+
+        /* Assign processing cycles to rx queue. */
+        cycles = cycle_timer_stop(&pmd->perf_stats, &timer);
+        dp_netdev_rxq_add_cycles(rxq, RXQ_CYCLES_PROC_CURR, cycles);
+
+        output_cnt = dp_netdev_pmd_flush_output_packets(pmd, false);
+    } else {
+        /* Discard cycles. */
+        cycle_timer_stop(&pmd->perf_stats, &timer);
+        if (error != EAGAIN && error != EOPNOTSUPP) {
+            static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(1, 5);
+
+            VLOG_ERR_RL(&rl, "error receiving data from %s: %s",
+                        netdev_rxq_get_name(rxq->rx), ovs_strerror(error));
+        }
    }
 
-    return batch_cnt;
+    pmd->ctx.last_rxq = NULL;
+
+    return batch_cnt + output_cnt;
 }
 
 static struct tx_port *
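The net effect of flush_time is a bounded-delay output batch: each send re-arms the deadline, and later iterations flush the accumulating batch once ctx.now passes it (or earlier, when the batch fills up in dp_execute_cb()). A standalone sketch of the policy (illustrative only; the field names mirror the patch, but this is not the actual code):

    #include <stdbool.h>
    #include <stdint.h>

    #define BATCH_MAX 32            /* Mirrors NETDEV_MAX_BURST. */

    struct out_port {
        int batch_count;            /* Packets queued on this port. */
        int64_t flush_time;         /* Deadline armed by the last flush. */
    };

    /* Called once per PMD iteration for each cached tx port. */
    static bool
    maybe_flush(struct out_port *p, int64_t now_us, uint32_t interval_us,
                bool force)
    {
        if (p->batch_count
            && (force || p->batch_count >= BATCH_MAX
                || now_us >= p->flush_time)) {
            /* netdev_send() of the queued batch would happen here. */
            p->batch_count = 0;
            p->flush_time = now_us + interval_us;
            return true;
        }
        return false;
    }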
@@ -3953,31 +3926,33 @@ dpif_netdev_run(struct dpif *dpif)
     struct dp_netdev *dp = get_dp_netdev(dpif);
     struct dp_netdev_pmd_thread *non_pmd;
     uint64_t new_tnl_seq;
-    int process_packets = 0;
+    bool need_to_flush = true;
 
     ovs_mutex_lock(&dp->port_mutex);
     non_pmd = dp_netdev_get_pmd(dp, NON_PMD_CORE_ID);
     if (non_pmd) {
         ovs_mutex_lock(&dp->non_pmd_mutex);
-        cycles_count_start(non_pmd);
         HMAP_FOR_EACH (port, node, &dp->ports) {
             if (!netdev_is_pmd(port->netdev)) {
                 int i;
 
                 for (i = 0; i < port->n_rxq; i++) {
-                    process_packets =
-                        dp_netdev_process_rxq_port(non_pmd,
-                                                   port->rxqs[i].rx,
-                                                   port->port_no);
-                    cycles_count_intermediate(non_pmd, NULL,
-                                              process_packets
-                                              ? PMD_CYCLES_PROCESSING
-                                              : PMD_CYCLES_IDLE);
+                    if (dp_netdev_process_rxq_port(non_pmd,
+                                                   &port->rxqs[i],
+                                                   port->port_no)) {
+                        need_to_flush = false;
+                    }
                 }
             }
         }
-        cycles_count_end(non_pmd, PMD_CYCLES_IDLE);
-        pmd_thread_ctx_time_update(non_pmd);
+        if (need_to_flush) {
+            /* We didn't receive anything in the process loop.
+             * Check if we need to send something.
+             * There were no time updates on the current iteration. */
+            pmd_thread_ctx_time_update(non_pmd);
+            dp_netdev_pmd_flush_output_packets(non_pmd, false);
+        }
+
         dpif_netdev_xps_revalidate_pmd(non_pmd, false);
         ovs_mutex_unlock(&dp->non_pmd_mutex);
 
@@ -4028,6 +4003,8 @@ pmd_free_cached_ports(struct dp_netdev_pmd_thread *pmd)
 {
     struct tx_port *tx_port_cached;
 
+    /* Flush all the queued packets. */
+    dp_netdev_pmd_flush_output_packets(pmd, true);
     /* Free all used tx queue ids. */
     dpif_netdev_xps_revalidate_pmd(pmd, true);
 
@@ -4121,6 +4098,7 @@ static void *
 pmd_thread_main(void *f_)
 {
     struct dp_netdev_pmd_thread *pmd = f_;
+    struct pmd_perf_stats *s = &pmd->perf_stats;
     unsigned int lc = 0;
     struct polled_queue *poll_list;
     bool exiting;
@@ -4144,6 +4122,8 @@ reload:
         VLOG_DBG("Core %d processing port \'%s\' with queue-id %d\n",
                  pmd->core_id, netdev_rxq_get_name(poll_list[i].rxq->rx),
                  netdev_rxq_get_queue_id(poll_list[i].rxq->rx));
+        /* Reset the rxq current cycles counter. */
+        dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR, 0);
     }
 
     if (!poll_cnt) {
@@ -4154,15 +4134,26 @@ reload:
         lc = UINT_MAX;
     }
 
-    cycles_count_start(pmd);
+    pmd->intrvl_tsc_prev = 0;
+    atomic_store_relaxed(&pmd->intrvl_cycles, 0);
+    cycles_counter_update(s);
     for (;;) {
+        uint64_t iter_packets = 0;
+
+        pmd_perf_start_iteration(s);
         for (i = 0; i < poll_cnt; i++) {
             process_packets =
-                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq->rx,
+                dp_netdev_process_rxq_port(pmd, poll_list[i].rxq,
                                            poll_list[i].port_no);
-            cycles_count_intermediate(pmd, poll_list[i].rxq,
-                                      process_packets ? PMD_CYCLES_PROCESSING
-                                                      : PMD_CYCLES_IDLE);
+            iter_packets += process_packets;
+        }
+
+        if (!iter_packets) {
+            /* We didn't receive anything in the process loop.
+             * Check if we need to send something.
+             * There were no time updates on the current iteration. */
+            pmd_thread_ctx_time_update(pmd);
+            iter_packets += dp_netdev_pmd_flush_output_packets(pmd, false);
         }
 
         if (lc++ > 1024) {
@@ -4171,9 +4162,6 @@ reload:
             lc = 0;
 
             coverage_try_clear();
-            /* It's possible that the time was not updated on current
-             * iteration, if there were no received packets. */
-            pmd_thread_ctx_time_update(pmd);
             dp_netdev_pmd_try_optimize(pmd, poll_list, poll_cnt);
             if (!ovsrcu_try_quiesce()) {
                 emc_cache_slow_sweep(&pmd->flow_cache);
@@ -4184,10 +4172,9 @@ reload:
                 break;
             }
         }
+        pmd_perf_end_iteration(s, iter_packets);
     }
 
-    cycles_count_end(pmd, PMD_CYCLES_IDLE);
-
     poll_cnt = pmd_load_queues_and_ports(pmd, &poll_list);
     exiting = latch_is_set(&pmd->exit_latch);
     /* Signal here to make sure the pmd finishes
@@ -4259,7 +4246,7 @@ dp_netdev_run_meter(struct dp_netdev *dp, struct dp_packet_batch *packets_,
     memset(exceeded_rate, 0, cnt * sizeof *exceeded_rate);
 
     /* All packets will hit the meter at the same time. */
-    long_delta_t = (now - meter->used); /* msec */
+    long_delta_t = (now - meter->used) / 1000; /* msec */
 
     /* Make sure delta_t will not be too large, so that bucket will not
     * wrap around below. */
@@ -4415,7 +4402,7 @@ dpif_netdev_meter_set(struct dpif *dpif, ofproto_meter_id *meter_id,
         meter->flags = config->flags;
         meter->n_bands = config->n_bands;
         meter->max_delta_t = 0;
-        meter->used = time_msec();
+        meter->used = time_usec();
 
         /* set up bands */
         for (i = 0; i < config->n_bands; ++i) {
@@ -4613,6 +4600,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     pmd->core_id = core_id;
     pmd->numa_id = numa_id;
     pmd->need_reload = false;
+    pmd->n_output_batches = 0;
 
     ovs_refcount_init(&pmd->ref_cnt);
     latch_init(&pmd->exit_latch);
@@ -4625,6 +4613,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
     ovs_mutex_init(&pmd->port_mutex);
     cmap_init(&pmd->flow_table);
     cmap_init(&pmd->classifiers);
+    pmd->ctx.last_rxq = NULL;
     pmd_thread_ctx_time_update(pmd);
     pmd->next_optimization = pmd->ctx.now + DPCLS_OPTIMIZATION_INTERVAL;
     pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
@@ -4638,6 +4627,7 @@ dp_netdev_configure_pmd(struct dp_netdev_pmd_thread *pmd, struct dp_netdev *dp,
         emc_cache_init(&pmd->flow_cache);
         pmd_alloc_static_tx_qid(pmd);
     }
+    pmd_perf_stats_init(&pmd->perf_stats);
     cmap_insert(&dp->poll_threads, CONST_CAST(struct cmap_node *, &pmd->node),
                 hash_int(core_id, 0));
 }
@@ -4800,6 +4790,7 @@ dp_netdev_add_port_tx_to_pmd(struct dp_netdev_pmd_thread *pmd,
 
     tx->port = port;
     tx->qid = -1;
+    tx->flush_time = 0LL;
     dp_packet_batch_init(&tx->output_pkts);
 
     hmap_insert(&pmd->tx_ports, &tx->node, hash_port_no(tx->port->port_no));
@@ -4838,13 +4829,6 @@ dp_netdev_flow_used(struct dp_netdev_flow *netdev_flow, int cnt, int size,
     atomic_store_relaxed(&netdev_flow->stats.tcp_flags, flags);
 }
 
-static void
-dp_netdev_count_packet(struct dp_netdev_pmd_thread *pmd,
-                       enum dp_stat_type type, int cnt)
-{
-    non_atomic_ullong_add(&pmd->stats.n[type], cnt);
-}
-
 static int
 dp_netdev_upcall(struct dp_netdev_pmd_thread *pmd, struct dp_packet *packet_,
                  struct flow *flow, struct flow_wildcards *wc, ovs_u128 *ufid,
@@ -4963,7 +4947,7 @@ packet_batch_per_flow_execute(struct packet_batch_per_flow *batch,
     struct dp_netdev_flow *flow = batch->flow;
 
     dp_netdev_flow_used(flow, batch->array.count, batch->byte_count,
-                        batch->tcp_flags, pmd->ctx.now);
+                        batch->tcp_flags, pmd->ctx.now / 1000);
 
     actions = dp_netdev_flow_get_actions(flow);
 
@@ -5017,6 +5001,9 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
     int i;
 
     atomic_read_relaxed(&pmd->dp->emc_insert_min, &cur_min);
+    pmd_perf_update_counter(&pmd->perf_stats,
+                            md_is_valid ? PMD_STAT_RECIRC : PMD_STAT_RECV,
+                            cnt);
 
     DP_PACKET_BATCH_REFILL_FOR_EACH (i, cnt, packet, packets_) {
         struct dp_netdev_flow *flow;
@@ -5065,18 +5052,17 @@ emc_processing(struct dp_netdev_pmd_thread *pmd,
         }
     }
 
-    dp_netdev_count_packet(pmd, DP_STAT_EXACT_HIT,
-                           cnt - n_dropped - n_missed);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_EXACT_HIT,
+                            cnt - n_dropped - n_missed);
 
     return dp_packet_batch_size(packets_);
 }
 
-static inline void
+static inline int
 handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
                      struct dp_packet *packet,
                      const struct netdev_flow_key *key,
-                     struct ofpbuf *actions, struct ofpbuf *put_actions,
-                     int *lost_cnt)
+                     struct ofpbuf *actions, struct ofpbuf *put_actions)
 {
     struct ofpbuf *add_actions;
     struct dp_packet_batch b;
@@ -5096,8 +5082,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
                              put_actions);
     if (OVS_UNLIKELY(error && error != ENOSPC)) {
         dp_packet_delete(packet);
-        (*lost_cnt)++;
-        return;
+        return error;
     }
 
     /* The Netlink encoding of datapath flow keys cannot express
@@ -5137,6 +5122,7 @@ handle_packet_upcall(struct dp_netdev_pmd_thread *pmd,
         ovs_mutex_unlock(&pmd->flow_mutex);
         emc_probabilistic_insert(pmd, key, netdev_flow);
     }
+    return error;
 }
 
 static inline void
@@ -5158,7 +5144,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
     struct dpcls *cls;
     struct dpcls_rule *rules[PKT_ARRAY_SIZE];
     struct dp_netdev *dp = pmd->dp;
-    int miss_cnt = 0, lost_cnt = 0;
+    int upcall_ok_cnt = 0, upcall_fail_cnt = 0;
     int lookup_cnt = 0, add_lookup_cnt;
     bool any_miss;
     size_t i;
@@ -5200,9 +5186,14 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
                 continue;
             }
 
-            miss_cnt++;
-            handle_packet_upcall(pmd, packet, &keys[i], &actions,
-                                 &put_actions, &lost_cnt);
+            int error = handle_packet_upcall(pmd, packet, &keys[i],
+                                             &actions, &put_actions);
+
+            if (OVS_UNLIKELY(error)) {
+                upcall_fail_cnt++;
+            } else {
+                upcall_ok_cnt++;
+            }
         }
 
         ofpbuf_uninit(&actions);
@@ -5212,8 +5203,7 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
         DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
             if (OVS_UNLIKELY(!rules[i])) {
                 dp_packet_delete(packet);
-                lost_cnt++;
-                miss_cnt++;
+                upcall_fail_cnt++;
             }
         }
     }
@@ -5231,10 +5221,14 @@ fast_path_processing(struct dp_netdev_pmd_thread *pmd,
         dp_netdev_queue_batches(packet, flow, &keys[i].mf,
                                 batches, n_batches);
     }
 
-    dp_netdev_count_packet(pmd, DP_STAT_MASKED_HIT, cnt - miss_cnt);
-    dp_netdev_count_packet(pmd, DP_STAT_LOOKUP_HIT, lookup_cnt);
-    dp_netdev_count_packet(pmd, DP_STAT_MISS, miss_cnt);
-    dp_netdev_count_packet(pmd, DP_STAT_LOST, lost_cnt);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_HIT,
+                            cnt - upcall_ok_cnt - upcall_fail_cnt);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MASKED_LOOKUP,
+                            lookup_cnt);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_MISS,
+                            upcall_ok_cnt);
+    pmd_perf_update_counter(&pmd->perf_stats, PMD_STAT_LOST,
+                            upcall_fail_cnt);
 }
 
 /* Packets enter the datapath from a port (or from recirculation) here.
@@ -5338,7 +5332,7 @@ dpif_netdev_xps_revalidate_pmd(const struct dp_netdev_pmd_thread *pmd,
             continue;
         }
         interval = pmd->ctx.now - tx->last_used;
-        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT_MS)) {
+        if (tx->qid >= 0 && (purge || interval >= XPS_TIMEOUT)) {
             port = tx->port;
             ovs_mutex_lock(&port->txq_used_mutex);
             port->txq_used[tx->qid]--;
@@ -5359,7 +5353,7 @@ dpif_netdev_xps_get_tx_qid(const struct dp_netdev_pmd_thread *pmd,
     interval = pmd->ctx.now - tx->last_used;
     tx->last_used = pmd->ctx.now;
 
-    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT_MS)) {
+    if (OVS_LIKELY(tx->qid >= 0 && interval < XPS_TIMEOUT)) {
         return tx->qid;
     }
 
@@ -5491,13 +5485,19 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
             dp_netdev_pmd_flush_output_on_port(pmd, p);
         }
 #endif
-        if (OVS_UNLIKELY(dp_packet_batch_size(&p->output_pkts)
-                         + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST)) {
-            /* Some packets was generated while input batch processing.
-             * Flush here to avoid overflow. */
+        if (dp_packet_batch_size(&p->output_pkts)
+            + dp_packet_batch_size(packets_) > NETDEV_MAX_BURST) {
+            /* Flush here to avoid overflow. */
             dp_netdev_pmd_flush_output_on_port(pmd, p);
         }
+
+        if (dp_packet_batch_is_empty(&p->output_pkts)) {
+            pmd->n_output_batches++;
+        }
+
         DP_PACKET_BATCH_FOR_EACH (packet, packets_) {
+            p->output_pkts_rxqs[dp_packet_batch_size(&p->output_pkts)] =
+                                                             pmd->ctx.last_rxq;
             dp_packet_batch_add(&p->output_pkts, packet);
         }
         return;
@@ -5738,7 +5738,7 @@ dp_execute_cb(void *aux_, struct dp_packet_batch *packets_,
         conntrack_execute(&dp->conntrack, packets_, aux->flow->dl_type, force,
                           commit, zone, setmark, setlabel, aux->flow->tp_src,
                           aux->flow->tp_dst, helper, nat_action_info_ref,
-                          pmd->ctx.now);
+                          pmd->ctx.now / 1000);
         break;
     }
 
@@ -6135,6 +6135,7 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
     struct dpcls *cls;
 
     if (pmd->ctx.now > pmd->rxq_next_cycle_store) {
+        uint64_t curr_tsc;
         /* Get the cycles that were used to process each queue and store. */
         for (unsigned i = 0; i < poll_cnt; i++) {
             uint64_t rxq_cyc_curr = dp_netdev_rxq_get_cycles(poll_list[i].rxq,
@@ -6143,6 +6144,13 @@ dp_netdev_pmd_try_optimize(struct dp_netdev_pmd_thread *pmd,
             dp_netdev_rxq_set_cycles(poll_list[i].rxq, RXQ_CYCLES_PROC_CURR,
                                      0);
         }
+        curr_tsc = cycles_counter_update(&pmd->perf_stats);
+        if (pmd->intrvl_tsc_prev) {
+            /* There is a prev timestamp, store a new intrvl cycle count. */
+            atomic_store_relaxed(&pmd->intrvl_cycles,
+                                 curr_tsc - pmd->intrvl_tsc_prev);
+        }
+        pmd->intrvl_tsc_prev = curr_tsc;
         /* Start new measuring interval */
         pmd->rxq_next_cycle_store = pmd->ctx.now + PMD_RXQ_INTERVAL_LEN;
     }
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index e32c7f678..ac2e38e7e 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -2436,6 +2436,7 @@ netdev_dpdk_policer_construct(uint32_t rate, uint32_t burst)
                                     &policer->app_srtcm_params);
     if (err) {
         VLOG_ERR("Could not create rte meter for ingress policer");
+        free(policer);
         return NULL;
     }
 
@@ -2615,6 +2616,64 @@ netdev_dpdk_update_flags(struct netdev *netdev,
 }
 
 static int
+netdev_dpdk_vhost_user_get_status(const struct netdev *netdev,
+                                  struct smap *args)
+{
+    struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+
+    ovs_mutex_lock(&dev->mutex);
+
+    bool client_mode = dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT;
+    smap_add_format(args, "mode", "%s", client_mode ? "client" : "server");
+
+    int vid = netdev_dpdk_get_vid(dev);
+    if (vid < 0) {
+        smap_add_format(args, "status", "disconnected");
+        ovs_mutex_unlock(&dev->mutex);
+        return 0;
+    } else {
+        smap_add_format(args, "status", "connected");
+    }
+
+    char socket_name[PATH_MAX];
+    if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) {
+        smap_add_format(args, "socket", "%s", socket_name);
+    }
+
+    uint64_t features;
+    if (!rte_vhost_get_negotiated_features(vid, &features)) {
+        smap_add_format(args, "features", "0x%016"PRIx64, features);
+    }
+
+    uint16_t mtu;
+    if (!rte_vhost_get_mtu(vid, &mtu)) {
+        smap_add_format(args, "mtu", "%d", mtu);
+    }
+
+    int numa = rte_vhost_get_numa_node(vid);
+    if (numa >= 0) {
+        smap_add_format(args, "numa", "%d", numa);
+    }
+
+    uint16_t vring_num = rte_vhost_get_vring_num(vid);
+    if (vring_num) {
+        smap_add_format(args, "num_of_vrings", "%d", vring_num);
+    }
+
+    for (int i = 0; i < vring_num; i++) {
+        struct rte_vhost_vring vring;
+        char vhost_vring[16];
+
+        rte_vhost_get_vhost_vring(vid, i, &vring);
+        snprintf(vhost_vring, 16, "vring_%d_size", i);
+        smap_add_format(args, vhost_vring, "%d", vring.size);
+    }
+
+    ovs_mutex_unlock(&dev->mutex);
+    return 0;
+}
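These key/value pairs surface through the regular netdev status path, so once a guest connects they should show up in the Interface table, roughly like this (illustrative; the key names come from the code above, the exact formatting may differ):

    $ ovs-vsctl get Interface vhost-user0 status
    {features="0x0000000150208182", mode=server, mtu="1500",
     num_of_vrings="2", numa="0",
     socket="/usr/local/var/run/openvswitch/vhost-user0",
     status=connected, vring_0_size="256", vring_1_size="256"}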
"client" : "server"); + + int vid = netdev_dpdk_get_vid(dev); + if (vid < 0) { + smap_add_format(args, "status", "disconnected"); + ovs_mutex_unlock(&dev->mutex); + return 0; + } else { + smap_add_format(args, "status", "connected"); + } + + char socket_name[PATH_MAX]; + if (!rte_vhost_get_ifname(vid, socket_name, PATH_MAX)) { + smap_add_format(args, "socket", "%s", socket_name); + } + + uint64_t features; + if (!rte_vhost_get_negotiated_features(vid, &features)) { + smap_add_format(args, "features", "0x%016"PRIx64, features); + } + + uint16_t mtu; + if (!rte_vhost_get_mtu(vid, &mtu)) { + smap_add_format(args, "mtu", "%d", mtu); + } + + int numa = rte_vhost_get_numa_node(vid); + if (numa >= 0) { + smap_add_format(args, "numa", "%d", numa); + } + + uint16_t vring_num = rte_vhost_get_vring_num(vid); + if (vring_num) { + smap_add_format(args, "num_of_vrings", "%d", vring_num); + } + + for (int i = 0; i < vring_num; i++) { + struct rte_vhost_vring vring; + char vhost_vring[16]; + + rte_vhost_get_vhost_vring(vid, i, &vring); + snprintf(vhost_vring, 16, "vring_%d_size", i); + smap_add_format(args, vhost_vring, "%d", vring.size); + } + + ovs_mutex_unlock(&dev->mutex); + return 0; +} + +static int netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); @@ -3698,7 +3757,7 @@ static const struct netdev_class dpdk_vhost_class = netdev_dpdk_vhost_get_stats, NULL, NULL, - NULL, + netdev_dpdk_vhost_user_get_status, netdev_dpdk_vhost_reconfigure, netdev_dpdk_vhost_rxq_recv); static const struct netdev_class dpdk_vhost_client_class = @@ -3714,7 +3773,7 @@ static const struct netdev_class dpdk_vhost_client_class = netdev_dpdk_vhost_get_stats, NULL, NULL, - NULL, + netdev_dpdk_vhost_user_get_status, netdev_dpdk_vhost_client_reconfigure, netdev_dpdk_vhost_rxq_recv); |