diff options
author | Ben Pfaff <blp@ovn.org> | 2017-12-11 10:24:09 -0800 |
---|---|---|
committer | Ben Pfaff <blp@ovn.org> | 2017-12-11 10:24:09 -0800 |
commit | 8c087cecffb0355680ffcc2ac2debc41ac163ae6 (patch) | |
tree | b782feb589a8066d018af2dca27ad803045f9959 /lib | |
parent | 433695320a9ea08f1287e9de0c4eabb29a88483e (diff) | |
parent | 3eb8d4fa0db3159a8ffc8f52223417b3417263b3 (diff) | |
download | openvswitch-8c087cecffb0355680ffcc2ac2debc41ac163ae6.tar.gz |
Merge branch 'dpdk_merge' of https://github.com/istokes/ovs into HEAD
Diffstat (limited to 'lib')
-rw-r--r-- | lib/dpdk-stub.c | 6 | ||||
-rw-r--r-- | lib/dpdk.c | 12 | ||||
-rw-r--r-- | lib/dpdk.h | 3 | ||||
-rw-r--r-- | lib/dpif-netdev.c | 202 | ||||
-rw-r--r-- | lib/netdev-dpdk.c | 70 |
5 files changed, 169 insertions, 124 deletions
diff --git a/lib/dpdk-stub.c b/lib/dpdk-stub.c index daef7291f..36021807c 100644 --- a/lib/dpdk-stub.c +++ b/lib/dpdk-stub.c @@ -48,3 +48,9 @@ dpdk_get_vhost_sock_dir(void) { return NULL; } + +bool +dpdk_vhost_iommu_enabled(void) +{ + return false; +} diff --git a/lib/dpdk.c b/lib/dpdk.c index 8da6c3244..6710d10fc 100644 --- a/lib/dpdk.c +++ b/lib/dpdk.c @@ -41,6 +41,7 @@ VLOG_DEFINE_THIS_MODULE(dpdk); static FILE *log_stream = NULL; /* Stream for DPDK log redirection */ static char *vhost_sock_dir = NULL; /* Location of vhost-user sockets */ +static bool vhost_iommu_enabled = false; /* Status of vHost IOMMU support */ static int process_vhost_flags(char *flag, const char *default_val, int size, @@ -345,6 +346,11 @@ dpdk_init__(const struct smap *ovs_other_config) vhost_sock_dir = sock_dir_subcomponent; } + vhost_iommu_enabled = smap_get_bool(ovs_other_config, + "vhost-iommu-support", false); + VLOG_INFO("IOMMU support for vhost-user-client %s.", + vhost_iommu_enabled ? "enabled" : "disabled"); + argv = grow_argv(&argv, 0, 1); argc = 1; argv[0] = xstrdup(ovs_get_program_name()); @@ -482,6 +488,12 @@ dpdk_get_vhost_sock_dir(void) return vhost_sock_dir; } +bool +dpdk_vhost_iommu_enabled(void) +{ + return vhost_iommu_enabled; +} + void dpdk_set_lcore_id(unsigned cpu) { diff --git a/lib/dpdk.h b/lib/dpdk.h index 673a1f17e..dc58d968a 100644 --- a/lib/dpdk.h +++ b/lib/dpdk.h @@ -17,6 +17,8 @@ #ifndef DPDK_H #define DPDK_H +#include <stdbool.h> + #ifdef DPDK_NETDEV #include <rte_config.h> @@ -35,5 +37,6 @@ struct smap; void dpdk_init(const struct smap *ovs_other_config); void dpdk_set_lcore_id(unsigned cpu); const char *dpdk_get_vhost_sock_dir(void); +bool dpdk_vhost_iommu_enabled(void); #endif /* dpdk.h */ diff --git a/lib/dpif-netdev.c b/lib/dpif-netdev.c index b1ef9a6a5..43f6a7857 100644 --- a/lib/dpif-netdev.c +++ b/lib/dpif-netdev.c @@ -547,31 +547,18 @@ struct tx_port { * actions in either case. * */ struct dp_netdev_pmd_thread { - PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0, - struct dp_netdev *dp; - struct cmap_node node; /* In 'dp->poll_threads'. */ - pthread_cond_t cond; /* For synchronizing pmd thread - reload. */ - ); - - PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline1, - struct ovs_mutex cond_mutex; /* Mutex for condition variable. */ - pthread_t thread; - unsigned core_id; /* CPU core id of this pmd thread. */ - int numa_id; /* numa node id of this pmd thread. */ - ); + struct dp_netdev *dp; + struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */ + struct cmap_node node; /* In 'dp->poll_threads'. */ + + pthread_cond_t cond; /* For synchronizing pmd thread reload. */ + struct ovs_mutex cond_mutex; /* Mutex for condition variable. */ /* Per thread exact-match cache. Note, the instance for cpu core * NON_PMD_CORE_ID can be accessed by multiple threads, and thusly * need to be protected by 'non_pmd_mutex'. Every other instance * will only be accessed by its own pmd thread. */ - OVS_ALIGNED_VAR(CACHE_LINE_SIZE) struct emc_cache flow_cache; - struct ovs_refcount ref_cnt; /* Every reference must be refcount'ed. */ - - /* Queue id used by this pmd thread to send packets on all netdevs if - * XPS disabled for this netdev. All static_tx_qid's are unique and less - * than 'cmap_count(dp->poll_threads)'. */ - uint32_t static_tx_qid; + struct emc_cache flow_cache; /* Flow-Table and classifiers * @@ -580,77 +567,68 @@ struct dp_netdev_pmd_thread { * 'flow_mutex'. */ struct ovs_mutex flow_mutex; - PADDED_MEMBERS(CACHE_LINE_SIZE, - struct cmap flow_table OVS_GUARDED; /* Flow table. */ - - /* One classifier per in_port polled by the pmd */ - struct cmap classifiers; - /* Periodically sort subtable vectors according to hit frequencies */ - long long int next_optimization; - /* End of the next time interval for which processing cycles - are stored for each polled rxq. */ - long long int rxq_next_cycle_store; - - /* Cycles counters */ - struct dp_netdev_pmd_cycles cycles; - - /* Used to count cycles. See 'cycles_counter_end()'. */ - unsigned long long last_cycles; - struct latch exit_latch; /* For terminating the pmd thread. */ - ); - - PADDED_MEMBERS(CACHE_LINE_SIZE, - /* Statistics. */ - struct dp_netdev_pmd_stats stats; - - struct seq *reload_seq; - uint64_t last_reload_seq; - atomic_bool reload; /* Do we need to reload ports? */ - bool isolated; - - /* Set to true if the pmd thread needs to be reloaded. */ - bool need_reload; - /* 5 pad bytes. */ - ); - - PADDED_MEMBERS(CACHE_LINE_SIZE, - struct ovs_mutex port_mutex; /* Mutex for 'poll_list' - and 'tx_ports'. */ - /* 16 pad bytes. */ - ); - PADDED_MEMBERS(CACHE_LINE_SIZE, - /* List of rx queues to poll. */ - struct hmap poll_list OVS_GUARDED; - /* Map of 'tx_port's used for transmission. Written by the main - * thread, read by the pmd thread. */ - struct hmap tx_ports OVS_GUARDED; - ); - PADDED_MEMBERS(CACHE_LINE_SIZE, - /* These are thread-local copies of 'tx_ports'. One contains only - * tunnel ports (that support push_tunnel/pop_tunnel), the other - * contains ports with at least one txq (that support send). - * A port can be in both. - * - * There are two separate maps to make sure that we don't try to - * execute OUTPUT on a device which has 0 txqs or PUSH/POP on a - * non-tunnel device. - * - * The instances for cpu core NON_PMD_CORE_ID can be accessed by - * multiple threads and thusly need to be protected by 'non_pmd_mutex'. - * Every other instance will only be accessed by its own pmd thread. */ - struct hmap tnl_port_cache; - struct hmap send_port_cache; - ); - - PADDED_MEMBERS(CACHE_LINE_SIZE, - /* Only a pmd thread can write on its own 'cycles' and 'stats'. - * The main thread keeps 'stats_zero' and 'cycles_zero' as base - * values and subtracts them from 'stats' and 'cycles' before - * reporting to the user */ - unsigned long long stats_zero[DP_N_STATS]; - uint64_t cycles_zero[PMD_N_CYCLES]; - /* 8 pad bytes. */ - ); + struct cmap flow_table OVS_GUARDED; /* Flow table. */ + + /* One classifier per in_port polled by the pmd */ + struct cmap classifiers; + /* Periodically sort subtable vectors according to hit frequencies */ + long long int next_optimization; + /* End of the next time interval for which processing cycles + are stored for each polled rxq. */ + long long int rxq_next_cycle_store; + + /* Statistics. */ + struct dp_netdev_pmd_stats stats; + + /* Cycles counters */ + struct dp_netdev_pmd_cycles cycles; + + /* Used to count cicles. See 'cycles_counter_end()' */ + unsigned long long last_cycles; + + struct latch exit_latch; /* For terminating the pmd thread. */ + struct seq *reload_seq; + uint64_t last_reload_seq; + atomic_bool reload; /* Do we need to reload ports? */ + pthread_t thread; + unsigned core_id; /* CPU core id of this pmd thread. */ + int numa_id; /* numa node id of this pmd thread. */ + bool isolated; + + /* Queue id used by this pmd thread to send packets on all netdevs if + * XPS disabled for this netdev. All static_tx_qid's are unique and less + * than 'cmap_count(dp->poll_threads)'. */ + uint32_t static_tx_qid; + + struct ovs_mutex port_mutex; /* Mutex for 'poll_list' and 'tx_ports'. */ + /* List of rx queues to poll. */ + struct hmap poll_list OVS_GUARDED; + /* Map of 'tx_port's used for transmission. Written by the main thread, + * read by the pmd thread. */ + struct hmap tx_ports OVS_GUARDED; + + /* These are thread-local copies of 'tx_ports'. One contains only tunnel + * ports (that support push_tunnel/pop_tunnel), the other contains ports + * with at least one txq (that support send). A port can be in both. + * + * There are two separate maps to make sure that we don't try to execute + * OUTPUT on a device which has 0 txqs or PUSH/POP on a non-tunnel device. + * + * The instances for cpu core NON_PMD_CORE_ID can be accessed by multiple + * threads, and thusly need to be protected by 'non_pmd_mutex'. Every + * other instance will only be accessed by its own pmd thread. */ + struct hmap tnl_port_cache; + struct hmap send_port_cache; + + /* Only a pmd thread can write on its own 'cycles' and 'stats'. + * The main thread keeps 'stats_zero' and 'cycles_zero' as base + * values and subtracts them from 'stats' and 'cycles' before + * reporting to the user */ + unsigned long long stats_zero[DP_N_STATS]; + uint64_t cycles_zero[PMD_N_CYCLES]; + + /* Set to true if the pmd thread needs to be reloaded. */ + bool need_reload; }; /* Interface to netdev-based datapath. */ @@ -2925,6 +2903,9 @@ dpif_netdev_execute(struct dpif *dpif, struct dpif_execute *execute) /* If this is part of a probe, Drop the packet, since executing * the action may actually cause spurious packets be sent into * the network. */ + if (pmd->core_id == NON_PMD_CORE_ID) { + dp_netdev_pmd_unref(pmd); + } return 0; } @@ -3445,28 +3426,31 @@ rr_numa_list_destroy(struct rr_numa_list *rr) /* Sort Rx Queues by the processing cycles they are consuming. */ static int -rxq_cycle_sort(const void *a, const void *b) +compare_rxq_cycles(const void *a, const void *b) { struct dp_netdev_rxq *qa; struct dp_netdev_rxq *qb; - uint64_t total_qa, total_qb; - unsigned i; + uint64_t cycles_qa, cycles_qb; qa = *(struct dp_netdev_rxq **) a; qb = *(struct dp_netdev_rxq **) b; - total_qa = total_qb = 0; - for (i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) { - total_qa += dp_netdev_rxq_get_intrvl_cycles(qa, i); - total_qb += dp_netdev_rxq_get_intrvl_cycles(qb, i); - } - dp_netdev_rxq_set_cycles(qa, RXQ_CYCLES_PROC_HIST, total_qa); - dp_netdev_rxq_set_cycles(qb, RXQ_CYCLES_PROC_HIST, total_qb); + cycles_qa = dp_netdev_rxq_get_cycles(qa, RXQ_CYCLES_PROC_HIST); + cycles_qb = dp_netdev_rxq_get_cycles(qb, RXQ_CYCLES_PROC_HIST); - if (total_qa >= total_qb) { - return -1; + if (cycles_qa != cycles_qb) { + return (cycles_qa < cycles_qb) ? 1 : -1; + } else { + /* Cycles are the same so tiebreak on port/queue id. + * Tiebreaking (as opposed to return 0) ensures consistent + * sort results across multiple OS's. */ + if (qa->port->port_no != qb->port->port_no) { + return (qa->port->port_no > qb->port->port_no) ? 1 : -1; + } else { + return netdev_rxq_get_queue_id(qa->rx) + - netdev_rxq_get_queue_id(qb->rx); + } } - return 1; } /* Assign pmds to queues. If 'pinned' is true, assign pmds to pinned @@ -3511,11 +3495,19 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) dp_netdev_pmd_unref(pmd); } } else if (!pinned && q->core_id == OVS_CORE_UNSPEC) { + uint64_t cycle_hist = 0; + if (n_rxqs == 0) { rxqs = xmalloc(sizeof *rxqs); } else { rxqs = xrealloc(rxqs, sizeof *rxqs * (n_rxqs + 1)); } + /* Sum the queue intervals and store the cycle history. */ + for (unsigned i = 0; i < PMD_RXQ_INTERVAL_MAX; i++) { + cycle_hist += dp_netdev_rxq_get_intrvl_cycles(q, i); + } + dp_netdev_rxq_set_cycles(q, RXQ_CYCLES_PROC_HIST, cycle_hist); + /* Store the queue. */ rxqs[n_rxqs++] = q; } @@ -3525,7 +3517,7 @@ rxq_scheduling(struct dp_netdev *dp, bool pinned) OVS_REQUIRES(dp->port_mutex) if (n_rxqs > 1) { /* Sort the queues in order of the processing cycles * they consumed during their last pmd interval. */ - qsort(rxqs, n_rxqs, sizeof *rxqs, rxq_cycle_sort); + qsort(rxqs, n_rxqs, sizeof *rxqs, compare_rxq_cycles); } rr_numa_list_populate(dp, &rr); diff --git a/lib/netdev-dpdk.c b/lib/netdev-dpdk.c index faff842b2..8f22264b3 100644 --- a/lib/netdev-dpdk.c +++ b/lib/netdev-dpdk.c @@ -26,6 +26,7 @@ #include <sys/socket.h> #include <linux/if.h> +#include <rte_bus_pci.h> #include <rte_config.h> #include <rte_cycles.h> #include <rte_errno.h> @@ -36,6 +37,7 @@ #include <rte_meter.h> #include <rte_pci.h> #include <rte_vhost.h> +#include <rte_version.h> #include "dirs.h" #include "dp-packet.h" @@ -140,8 +142,8 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20); #define DPDK_ETH_PORT_ID_INVALID RTE_MAX_ETHPORTS -/* DPDK library uses uint8_t for port_id. */ -typedef uint8_t dpdk_port_t; +/* DPDK library uses uint16_t for port_id. */ +typedef uint16_t dpdk_port_t; #define VHOST_ENQ_RETRY_NUM 8 #define IF_NAME_SZ (PATH_MAX > IFNAMSIZ ? PATH_MAX : IFNAMSIZ) @@ -330,6 +332,23 @@ enum dpdk_hw_ol_features { NETDEV_RX_CHECKSUM_OFFLOAD = 1 << 0, }; +/* + * In order to avoid confusion in variables names, following naming convention + * should be used, if possible: + * + * 'struct netdev' : 'netdev' + * 'struct netdev_dpdk' : 'dev' + * 'struct netdev_rxq' : 'rxq' + * 'struct netdev_rxq_dpdk' : 'rx' + * + * Example: + * struct netdev *netdev = netdev_from_name(name); + * struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); + * + * Also, 'netdev' should be used instead of 'dev->up', where 'netdev' was + * already defined. + */ + struct netdev_dpdk { PADDED_MEMBERS_CACHELINE_MARKER(CACHE_LINE_SIZE, cacheline0, dpdk_port_t port_id; @@ -1191,8 +1210,7 @@ netdev_dpdk_process_devargs(struct netdev_dpdk *dev, char *name = xmemdup0(devargs, strcspn(devargs, ",")); dpdk_port_t new_port_id = DPDK_ETH_PORT_ID_INVALID; - if (!rte_eth_dev_count() - || rte_eth_dev_get_port_by_name(name, &new_port_id) + if (rte_eth_dev_get_port_by_name(name, &new_port_id) || !rte_eth_dev_is_valid_port(new_port_id)) { /* Device not found in DPDK, attempt to attach it */ if (!rte_eth_dev_attach(devargs, &new_port_id)) { @@ -2446,6 +2464,14 @@ netdev_dpdk_get_status(const struct netdev *netdev, struct smap *args) smap_add_format(args, "max_vfs", "%u", dev_info.max_vfs); smap_add_format(args, "max_vmdq_pools", "%u", dev_info.max_vmdq_pools); + /* Querying the DPDK library for iftype may be done in future, pending + * support; cf. RFC 3635 Section 3.2.4. */ + enum { IF_TYPE_ETHERNETCSMACD = 6 }; + + smap_add_format(args, "if_type", "%"PRIu32, IF_TYPE_ETHERNETCSMACD); + smap_add_format(args, "if_descr", "%s %s", rte_version(), + dev_info.driver_name); + if (dev_info.pci_dev) { smap_add_format(args, "pci-vendor_id", "0x%u", dev_info.pci_dev->id.vendor_id); @@ -2486,12 +2512,13 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc, if (argc > 2) { struct netdev *netdev = netdev_from_name(argv[1]); + if (netdev && is_dpdk_class(netdev->netdev_class)) { - struct netdev_dpdk *dpdk_dev = netdev_dpdk_cast(netdev); + struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); - ovs_mutex_lock(&dpdk_dev->mutex); - netdev_dpdk_set_admin_state__(dpdk_dev, up); - ovs_mutex_unlock(&dpdk_dev->mutex); + ovs_mutex_lock(&dev->mutex); + netdev_dpdk_set_admin_state__(dev, up); + ovs_mutex_unlock(&dev->mutex); netdev_close(netdev); } else { @@ -2500,13 +2527,13 @@ netdev_dpdk_set_admin_state(struct unixctl_conn *conn, int argc, return; } } else { - struct netdev_dpdk *netdev; + struct netdev_dpdk *dev; ovs_mutex_lock(&dpdk_mutex); - LIST_FOR_EACH (netdev, list_node, &dpdk_list) { - ovs_mutex_lock(&netdev->mutex); - netdev_dpdk_set_admin_state__(netdev, up); - ovs_mutex_unlock(&netdev->mutex); + LIST_FOR_EACH (dev, list_node, &dpdk_list) { + ovs_mutex_lock(&dev->mutex); + netdev_dpdk_set_admin_state__(dev, up); + ovs_mutex_unlock(&dev->mutex); } ovs_mutex_unlock(&dpdk_mutex); } @@ -2525,8 +2552,7 @@ netdev_dpdk_detach(struct unixctl_conn *conn, int argc OVS_UNUSED, ovs_mutex_lock(&dpdk_mutex); - if (!rte_eth_dev_count() || rte_eth_dev_get_port_by_name(argv[1], - &port_id)) { + if (rte_eth_dev_get_port_by_name(argv[1], &port_id)) { response = xasprintf("Device '%s' not found in DPDK", argv[1]); goto error; } @@ -3252,6 +3278,7 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) { struct netdev_dpdk *dev = netdev_dpdk_cast(netdev); int err; + uint64_t vhost_flags = 0; ovs_mutex_lock(&dev->mutex); @@ -3262,16 +3289,21 @@ netdev_dpdk_vhost_client_reconfigure(struct netdev *netdev) */ if (!(dev->vhost_driver_flags & RTE_VHOST_USER_CLIENT) && strlen(dev->vhost_id)) { - /* Register client-mode device */ - err = rte_vhost_driver_register(dev->vhost_id, - RTE_VHOST_USER_CLIENT); + /* Register client-mode device. */ + vhost_flags |= RTE_VHOST_USER_CLIENT; + + /* Enable IOMMU support, if explicitly requested. */ + if (dpdk_vhost_iommu_enabled()) { + vhost_flags |= RTE_VHOST_USER_IOMMU_SUPPORT; + } + err = rte_vhost_driver_register(dev->vhost_id, vhost_flags); if (err) { VLOG_ERR("vhost-user device setup failure for device %s\n", dev->vhost_id); goto unlock; } else { /* Configuration successful */ - dev->vhost_driver_flags |= RTE_VHOST_USER_CLIENT; + dev->vhost_driver_flags |= vhost_flags; VLOG_INFO("vHost User device '%s' created in 'client' mode, " "using client socket '%s'", dev->up.name, dev->vhost_id); |