author     Ben Pfaff <blp@ovn.org>    2017-12-11 10:24:09 -0800
committer  Ben Pfaff <blp@ovn.org>    2017-12-11 10:24:09 -0800
commit     8c087cecffb0355680ffcc2ac2debc41ac163ae6 (patch)
tree       b782feb589a8066d018af2dca27ad803045f9959 /lib
parent     433695320a9ea08f1287e9de0c4eabb29a88483e (diff)
parent     3eb8d4fa0db3159a8ffc8f52223417b3417263b3 (diff)
download   openvswitch-8c087cecffb0355680ffcc2ac2debc41ac163ae6.tar.gz
Merge branch 'dpdk_merge' of https://github.com/istokes/ovs into HEAD
Diffstat (limited to 'lib')
-rw-r--r--  lib/dpdk-stub.c      6
-rw-r--r--  lib/dpdk.c          12
-rw-r--r--  lib/dpdk.h           3
-rw-r--r--  lib/dpif-netdev.c  202
-rw-r--r--  lib/netdev-dpdk.c   70
5 files changed, 169 insertions(+), 124 deletions(-)
diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c
index daef7291f..36021807c 100644
--- a/lib/dpdk-stub.c
+++ b/lib/dpdk-stub.c
@@ -48,3 +48,9 @@ dpdk_get_vhost_sock_dir(void)
{
return NULL;
}
+
+bool
+dpdk_vhost_iommu_enabled(void)
+{
+ return false;
+}
diff --git a/lib/dpdk.c b/lib/dpdk.c
index 8da6c3244..6710d10fc 100644
--- a/lib/dpdk.c
+++ b/lib/dpdk.c
@@ -41,6 +41,7 @@ VLOG_DEFINE_THIS_MODULE(dpdk);
static FILE *log_stream = NULL; /* Stream for DPDK log redirection */
static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */
+static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */
static int
process_vhost_flags(char *flag, const char *default_val, int size,
@@ -345,6 +346,11 @@ dpdk_init__(const struct smap *ovs_other_config)
vhost_sock_dir = sock_dir_subcomponent;
}
+ vhost_iommu_enabled = smap_get_bool(ovs_other_config,
+ "vhost-iommu-support", false);
+ VLOG_INFO("IOMMU support for vhost-user-client %s.",
+ vhost_iommu_enabled ? "enabled" : "disabled");
+
argv = grow_argv(&argv, 0, 1);
argc = 1;
argv[0] = xstrdup(ovs_get_program_name());
@@ -482,6 +488,12 @@ dpdk_get_vhost_sock_dir(void)
return vhost_sock_dir;
}
+bool
+dpdk_vhost_iommu_enabled(void)
+{
+ return vhost_iommu_enabled;
+}
+
void
dpdk_set_lcore_id(unsigned cpu)
{
diff --git a/lib/dpdk.h b/lib/dpdk.h
index 673a1f17e..dc58d968a 100644
--- a/lib/dpdk.h
+++ b/lib/dpdk.h
@@ -17,6 +17,8 @@
#ifndef DPDK_H
#define DPDK_H
+#include <stdbool.h>
+
#ifdef DPDK_NETDEV
#include <rte_config.h>
@@ -35,5 +37,6 @@ struct smap;
void dpdk_init(const struct smap *ovs_other_config);
void dpdk_set_lcore_id(unsigned cpu);
const char *dpdk_get_vhost_sock_dir(void);
+bool dpdk_vhost_iommu_enabled(void);
#endif /* dpdk.h */
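Taken together, the dpdk-stub.c, dpdk.c and dpdk.h hunks above add a single boolean knob: dpdk_init__() reads other_config:vhost-iommu-support with smap_get_bool() (default false), dpdk_vhost_iommu_enabled() exposes the result, and the stub keeps it permanently false in builds without DPDK. The toy model below is illustrative only; the flag constants and main() are stand-ins, while the real flags are RTE_VHOST_USER_CLIENT and RTE_VHOST_USER_IOMMU_SUPPORT and the real consumer is netdev_dpdk_vhost_client_reconfigure() later in this diff.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for RTE_VHOST_USER_CLIENT / RTE_VHOST_USER_IOMMU_SUPPORT. */
#define CLIENT_FLAG        (1ULL << 0)
#define IOMMU_SUPPORT_FLAG (1ULL << 1)

static bool vhost_iommu_enabled = false;    /* set from other_config in dpdk_init__() */

static bool
dpdk_vhost_iommu_enabled(void)
{
    return vhost_iommu_enabled;
}

int
main(void)
{
    uint64_t vhost_flags = CLIENT_FLAG;     /* always register as a client */

    vhost_iommu_enabled = true;             /* other_config:vhost-iommu-support=true */
    if (dpdk_vhost_iommu_enabled()) {
        vhost_flags |= IOMMU_SUPPORT_FLAG;  /* request IOMMU support only on demand */
    }
    printf("vhost register flags: 0x%llx\n", (unsigned long long) vhost_flags);
    return 0;
}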
diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c
index b1ef9a6a5..43f6a7857 100644
--- a/lib/dpif-netdev.c
+++ b/lib/dpif-netdev.c
@@ -547,31 +547,18 @@ struct tx_port {
* actions in either case.
* */
struct dp_netdev_pmd_thread {
- PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
- struct dp_netdev *dp;
- struct cmap_node node; /* In 'dp->poll_threads'. */
- pthread_cond_t cond; /* For synchronizing pmd thread
- reload. */
- );
-
- PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1,
- struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
- pthread_t thread;
- unsigned core_id; /* CPU core id of this pmd thread. */
- int numa_id; /* numa node id of this pmd thread. */
- );
+ struct dp_netdev *dp;
+ struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
+ struct cmap_node node; /* In 'dp->poll_threads'. */
+
+ pthread_cond_t cond; /* For synchronizing pmd thread reload. */
+ struct ovs_mutex cond_mutex; /* Mutex for condition variable. */
/* Per thread exact-match cache. Note, the instance for cpu core
* NON_PMD_CORE_ID can be accessed by multiple threads, and thusly
* need to be protected by 'non_pmd_mutex'. Every other instance
* will only be accessed by its own pmd thread. */
- OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct emc_cache flow_cache;
- struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */
-
- /* Queue id used by this pmd thread to send packets on all netdevs if
- * XPS disabled for this netdev. All static_tx_qid's are unique and less
- * than 'cmap_count(dp->poll_threads)'. */
- uint32_t static_tx_qid;
+ struct emc_cache flow_cache;
/* Flow-Table and classifiers
*
@@ -580,77 +567,68 @@ struct dp_netdev_pmd_thread {
* 'flow_mutex'.
*/
struct ovs_mutex flow_mutex;
- PADDED_MEMBERS(CACHE_LINE_SIZE,
- struct cmap flow_table OVS_GUARDED; /* Flow table. */
-
- /* One classifier per in_port polled by the pmd */
- struct cmap classifiers;
- /* Periodically sort subtable vectors according to hit frequencies */
- long long int next_optimization;
- /* End of the next time interval for which processing cycles
- are stored for each polled rxq. */
- long long int rxq_next_cycle_store;
-
- /* Cycles counters */
- struct dp_netdev_pmd_cycles cycles;
-
- /* Used to count cycles. See 'cycles_counter_end()'. */
- unsigned long long last_cycles;
- struct latch exit_latch; /* For terminating the pmd thread. */
- );
-
- PADDED_MEMBERS(CACHE_LINE_SIZE,
- /* Statistics. */
- struct dp_netdev_pmd_stats stats;
-
- struct seq *reload_seq;
- uint64_t last_reload_seq;
- atomic_bool reload; /* Do we need to reload ports? */
- bool isolated;
-
- /* Set to true if the pmd thread needs to be reloaded. */
- bool need_reload;
- /* 5 pad bytes. */
- );
-
- PADDED_MEMBERS(CACHE_LINE_SIZE,
- struct ovs_mutex port_mutex; /* Mutex for 'poll_list'
- and 'tx_ports'. */
- /* 16 pad bytes. */
- );
- PADDED_MEMBERS(CACHE_LINE_SIZE,
- /* List of rx queues to poll. */
- struct hmap poll_list OVS_GUARDED;
- /* Map of 'tx_port's used for transmission. Written by the main
- * thread, read by the pmd thread. */
- struct hmap tx_ports OVS_GUARDED;
- );
- PADDED_MEMBERS(CACHE_LINE_SIZE,
- /* These are thread-local copies of 'tx_ports'. One contains only
- * tunnel ports (that support push_tunnel/pop_tunnel), the other
- * contains ports with at least one txq (that support send).
- * A port can be in both.
- *
- * There are two separate maps to make sure that we don't try to
- * execute OUTPUT on a device which has 0 txqs or PUSH/POP on a
- * non-tunnel device.
- *
- * The instances for cpu core NON_PMD_CORE_ID can be accessed by
- * multiple threads and thusly need to be protected by 'non_pmd_mutex'.
- * Every other instance will only be accessed by its own pmd thread. */
- struct hmap tnl_port_cache;
- struct hmap send_port_cache;
- );
-
- PADDED_MEMBERS(CACHE_LINE_SIZE,
- /* Only a pmd thread can write on its own 'cycles' and 'stats'.
- * The main thread keeps 'stats_zero' and 'cycles_zero' as base
- * values and subtracts them from 'stats' and 'cycles' before
- * reporting to the user */
- unsigned long long stats_zero[DP_N_STATS];
- uint64_t cycles_zero[PMD_N_CYCLES];
- /* 8 pad bytes. */
- );
+ struct cmap flow_table OVS_GUARDED; /* Flow table. */
+
+ /* One classifier per in_port polled by the pmd */
+ struct cmap classifiers;
+ /* Periodically sort subtable vectors according to hit frequencies */
+ long long int next_optimization;
+ /* End of the next time interval for which processing cycles
+ are stored for each polled rxq. */
+ long long int rxq_next_cycle_store;
+
+ /* Statistics. */
+ struct dp_netdev_pmd_stats stats;
+
+ /* Cycles counters */
+ struct dp_netdev_pmd_cycles cycles;
+
+ /* Used to count cycles. See 'cycles_counter_end()'. */
+ unsigned long long last_cycles;
+
+ struct latch exit_latch; /* For terminating the pmd thread. */
+ struct seq *reload_seq;
+ uint64_t last_reload_seq;
+ atomic_bool reload; /* Do we need to reload ports? */
+ pthread_t thread;
+ unsigned core_id; /* CPU core id of this pmd thread. */
+ int numa_id; /* numa node id of this pmd thread. */
+ bool isolated;
+
+ /* Queue id used by this pmd thread to send packets on all netdevs if
+ * XPS disabled for this netdev. All static_tx_qid's are unique and less
+ * than 'cmap_count(dp->poll_threads)'. */
+ uint32_t static_tx_qid;
+
+ struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */
+ /* List of rx queues to poll. */
+ struct hmap poll_list OVS_GUARDED;
+ /* Map of 'tx_port's used for transmission. Written by the main thread,
+ * read by the pmd thread. */
+ struct hmap tx_ports OVS_GUARDED;
+
+ /* These are thread-local copies of 'tx_ports'. One contains only tunnel
+ * ports (that support push_tunnel/pop_tunnel), the other contains ports
+ * with at least one txq (that support send). A port can be in both.
+ *
+ * There are two separate maps to make sure that we don't try to execute
+ * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device.
+ *
+ * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple
+ * threads, and thusly need to be protected by 'non_pmd_mutex'. Every
+ * other instance will only be accessed by its own pmd thread. */
+ struct hmap tnl_port_cache;
+ struct hmap send_port_cache;
+
+ /* Only a pmd thread can write on its own 'cycles' and 'stats'.
+ * The main thread keeps 'stats_zero' and 'cycles_zero' as base
+ * values and subtracts them from 'stats' and 'cycles' before
+ * reporting to the user */
+ unsigned long long stats_zero[DP_N_STATS];
+ uint64_t cycles_zero[PMD_N_CYCLES];
+
+ /* Set to true if the pmd thread needs to be reloaded. */
+ bool need_reload;
};
/* Interface to netdev-based datapath. */
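The hunk above reverts 'struct dp_netdev_pmd_thread' from cache-line-grouped members back to a flat layout. For readers unfamiliar with the removed PADDED_MEMBERS / PADDED_MEMBERS_CACHELINE_MARKER macros, the sketch below gives a rough idea of the technique; it is not the exact lib/util.h definition, which also rounds the pad up when a group exceeds one cache line and adds a named marker member. Each group of members is wrapped in an anonymous union with a pad array so that the next group starts on a fresh cache line.

#include <stdint.h>
#include <stdio.h>

#define CACHE_LINE_SIZE 64

/* Illustrative stand-in for the removed macros (assumes each group fits
 * in a single cache line). */
#define PADDED_GROUP(NAME, MEMBERS)             \
    union {                                     \
        struct { MEMBERS };                     \
        uint8_t pad_##NAME[CACHE_LINE_SIZE];    \
    }

struct padded_example {
    PADDED_GROUP(g0, int a; int b;);            /* group 0 fills bytes 0..63 */
    PADDED_GROUP(g1, long long counter;);       /* group 1 starts at byte 64 */
};

int
main(void)
{
    printf("sizeof(struct padded_example) = %zu\n", sizeof(struct padded_example));
    return 0;
}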
@@ -2925,6 +2903,9 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute)
/* If this is part of a probe, drop the packet, since executing
 * the action may actually cause spurious packets to be sent into
 * the network. */
+ if (pmd->core_id == NON_PMD_CORE_ID) {
+ dp_netdev_pmd_unref(pmd);
+ }
return 0;
}
@@ -3445,28 +3426,31 @@ rr_numa_list_destroy(struct rr_numa_list *rr)
/* Sort Rx Queues by the processing cycles they are consuming. */
static int
-rxq_cycle_sort(const void *a, const void *b)
+compare_rxq_cycles(const void *a, const void *b)
{
struct dp_netdev_rxq *qa;
struct dp_netdev_rxq *qb;
- uint64_t total_qa, total_qb;
- unsigned i;
+ uint64_t cycles_qa, cycles_qb;
qa = *(struct dp_netdev_rxq **) a;
qb = *(struct dp_netdev_rxq **) b;
- total_qa = total_qb = 0;
- for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
- total_qa += dp_netdev_rxq_get_intrvl_cycles(qa, i);
- total_qb += dp_netdev_rxq_get_intrvl_cycles(qb, i);
- }
- dp_netdev_rxq_set_cycles(qa, RXQ_CYCLES_PROC_HIST, total_qa);
- dp_netdev_rxq_set_cycles(qb, RXQ_CYCLES_PROC_HIST, total_qb);
+ cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST);
+ cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST);
- if (total_qa >= total_qb) {
- return -1;
+ if (cycles_qa != cycles_qb) {
+ return (cycles_qa < cycles_qb) ? 1 : -1;
+ } else {
+ /* Cycles are the same so tiebreak on port/queue id.
+ * Tiebreaking (as opposed to return 0) ensures consistent
+ * sort results across multiple OS's. */
+ if (qa->port->port_no != qb->port->port_no) {
+ return (qa->port->port_no > qb->port->port_no) ? 1 : -1;
+ } else {
+ return netdev_rxq_get_queue_id(qa->rx)
+ - netdev_rxq_get_queue_id(qb->rx);
+ }
}
- return 1;
}
/* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned
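The new comparator above is deliberately free of side effects and fully deterministic: processing cycles decide the primary order, and the port/queue-id tiebreak keeps qsort() output stable across libc implementations. A small self-contained demo of the same pattern follows; the toy 'struct q' and its ids are illustrative, not OVS types.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct q { uint64_t cycles; int id; };

static int
cmp_q(const void *a_, const void *b_)
{
    const struct q *a = a_, *b = b_;

    if (a->cycles != b->cycles) {
        return a->cycles < b->cycles ? 1 : -1;  /* busiest queue first */
    }
    return a->id - b->id;                       /* deterministic tiebreak */
}

int
main(void)
{
    struct q qs[] = { { 10, 2 }, { 30, 1 }, { 10, 0 } };

    qsort(qs, sizeof qs / sizeof qs[0], sizeof qs[0], cmp_q);
    for (size_t i = 0; i < sizeof qs / sizeof qs[0]; i++) {
        printf("id=%d cycles=%llu\n", qs[i].id, (unsigned long long) qs[i].cycles);
    }
    return 0;
}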
@@ -3511,11 +3495,19 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
dp_netdev_pmd_unref(pmd);
}
} else if (!pinned && q->core_id == OVS_CORE_UNSPEC) {
+ uint64_t cycle_hist = 0;
+
if (n_rxqs == 0) {
rxqs = xmalloc(sizeof *rxqs);
} else {
rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1));
}
+ /* Sum the queue intervals and store the cycle history. */
+ for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) {
+ cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i);
+ }
+ dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist);
+
/* Store the queue. */
rxqs[n_rxqs++] = q;
}
@@ -3525,7 +3517,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex)
if (n_rxqs > 1) {
/* Sort the queues in order of the processing cycles
* they consumed during their last pmd interval. */
- qsort(rxqs, n_rxqs, sizeof *rxqs, rxq_cycle_sort);
+ qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles);
}
rr_numa_list_populate(dp, &rr);
diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c
index faff842b2..8f22264b3 100644
--- a/lib/netdev-dpdk.c
+++ b/lib/netdev-dpdk.c
@@ -26,6 +26,7 @@
#include <sys/socket.h>
#include <linux/if.h>
+#include <rte_bus_pci.h>
#include <rte_config.h>
#include <rte_cycles.h>
#include <rte_errno.h>
@@ -36,6 +37,7 @@
#include <rte_meter.h>
#include <rte_pci.h>
#include <rte_vhost.h>
+#include <rte_version.h>
#include "dirs.h"
#include "dp-packet.h"
@@ -140,8 +142,8 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
#define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS
-/* DPDK library uses uint8_t for port_id. */
-typedef uint8_t dpdk_port_t;
+/* DPDK library uses uint16_t for port_id. */
+typedef uint16_t dpdk_port_t;
#define VHOST_ENQ_RETRY_NUM 8
#define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ)
@@ -330,6 +332,23 @@ enum dpdk_hw_ol_features {
NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0,
};
+/*
+ * In order to avoid confusion in variable names, the following naming
+ * convention should be used, if possible:
+ *
+ * 'struct netdev' : 'netdev'
+ * 'struct netdev_dpdk' : 'dev'
+ * 'struct netdev_rxq' : 'rxq'
+ * 'struct netdev_rxq_dpdk' : 'rx'
+ *
+ * Example:
+ * struct netdev *netdev = netdev_from_name(name);
+ * struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
+ *
+ * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was
+ * already defined.
+ */
+
struct netdev_dpdk {
PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0,
dpdk_port_t port_id;
@@ -1191,8 +1210,7 @@ netdev_dpdk_process_devargs(struct netdev_dpdk *dev,
char *name = xmemdup0(devargs, strcspn(devargs, ","));
dpdk_port_t new_port_id = DPDK_ETH_PORT_ID_INVALID;
- if (!rte_eth_dev_count()
- || rte_eth_dev_get_port_by_name(name, &new_port_id)
+ if (rte_eth_dev_get_port_by_name(name, &new_port_id)
|| !rte_eth_dev_is_valid_port(new_port_id)) {
/* Device not found in DPDK, attempt to attach it */
if (!rte_eth_dev_attach(devargs, &new_port_id)) {
@@ -2446,6 +2464,14 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args)
smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs);
smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools);
+ /* Querying the DPDK library for iftype may be done in future, pending
+ * support; cf. RFC 3635 Section 3.2.4. */
+ enum { IF_TYPE_ETHERNETCSMACD = 6 };
+
+ smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD);
+ smap_add_format(args, "if_descr", "%s %s", rte_version(),
+ dev_info.driver_name);
+
if (dev_info.pci_dev) {
smap_add_format(args, "pci-vendor_id", "0x%u",
dev_info.pci_dev->id.vendor_id);
@@ -2486,12 +2512,13 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
if (argc > 2) {
struct netdev *netdev = netdev_from_name(argv[1]);
+
if (netdev && is_dpdk_class(netdev->netdev_class)) {
- struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev);
+ struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
- ovs_mutex_lock(&dpdk_dev->mutex);
- netdev_dpdk_set_admin_state__(dpdk_dev, up);
- ovs_mutex_unlock(&dpdk_dev->mutex);
+ ovs_mutex_lock(&dev->mutex);
+ netdev_dpdk_set_admin_state__(dev, up);
+ ovs_mutex_unlock(&dev->mutex);
netdev_close(netdev);
} else {
@@ -2500,13 +2527,13 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc,
return;
}
} else {
- struct netdev_dpdk *netdev;
+ struct netdev_dpdk *dev;
ovs_mutex_lock(&dpdk_mutex);
- LIST_FOR_EACH (netdev, list_node, &dpdk_list) {
- ovs_mutex_lock(&netdev->mutex);
- netdev_dpdk_set_admin_state__(netdev, up);
- ovs_mutex_unlock(&netdev->mutex);
+ LIST_FOR_EACH (dev, list_node, &dpdk_list) {
+ ovs_mutex_lock(&dev->mutex);
+ netdev_dpdk_set_admin_state__(dev, up);
+ ovs_mutex_unlock(&dev->mutex);
}
ovs_mutex_unlock(&dpdk_mutex);
}
@@ -2525,8 +2552,7 @@ netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED,
ovs_mutex_lock(&dpdk_mutex);
- if (!rte_eth_dev_count() || rte_eth_dev_get_port_by_name(argv[1],
- &port_id)) {
+ if (rte_eth_dev_get_port_by_name(argv[1], &port_id)) {
response = xasprintf("Device '%s' not found in DPDK", argv[1]);
goto error;
}
@@ -3252,6 +3278,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
{
struct netdev_dpdk *dev = netdev_dpdk_cast(netdev);
int err;
+ uint64_t vhost_flags = 0;
ovs_mutex_lock(&dev->mutex);
@@ -3262,16 +3289,21 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev)
*/
if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT)
&& strlen(dev->vhost_id)) {
- /* Register client-mode device */
- err = rte_vhost_driver_register(dev->vhost_id,
- RTE_VHOST_USER_CLIENT);
+ /* Register client-mode device. */
+ vhost_flags |= RTE_VHOST_USER_CLIENT;
+
+ /* Enable IOMMU support, if explicitly requested. */
+ if (dpdk_vhost_iommu_enabled()) {
+ vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT;
+ }
+ err = rte_vhost_driver_register(dev->vhost_id, vhost_flags);
if (err) {
VLOG_ERR("vhost-user device setup failure for device %s\n",
dev->vhost_id);
goto unlock;
} else {
/* Configuration successful */
- dev->vhost_driver_flags |= RTE_VHOST_USER_CLIENT;
+ dev->vhost_driver_flags |= vhost_flags;
VLOG_INFO("vHost User device '%s' created in 'client' mode, "
"using client socket '%s'",
dev->up.name, dev->vhost_id);