summaryrefslogtreecommitdiff
path: root/lib
diff options
context:
space:
mode:
authorWilliam Tu <u9012063@gmail.com>2019-07-18 13:11:14 -0700
committerIlya Maximets <i.maximets@samsung.com>2019-07-19 17:42:06 +0300
commit0de1b425962db073ebbaa3ddbde445580afda840 (patch)
tree583c01f90fcb0422e2ea500bbe20c945cb81f12e /lib
parent884ca8aceb0cbb5dc8ca3d124a59157ef2b941cf (diff)
downloadopenvswitch-0de1b425962db073ebbaa3ddbde445580afda840.tar.gz
netdev-afxdp: add new netdev type for AF_XDP.
The patch introduces experimental AF_XDP support for OVS netdev. AF_XDP, the Address Family of the eXpress Data Path, is a new Linux socket type built upon the eBPF and XDP technology. It aims to have performance comparable to DPDK while cooperating better with the existing kernel networking stack. An AF_XDP socket receives and sends packets from an eBPF/XDP program attached to the netdev, bypassing a couple of Linux kernel subsystems. As a result, the AF_XDP socket shows much better performance than AF_PACKET. For more details about AF_XDP, please see the Linux kernel's Documentation/networking/af_xdp.rst. Note that by default, this feature is not compiled in. Signed-off-by: William Tu <u9012063@gmail.com> Signed-off-by: Ilya Maximets <i.maximets@samsung.com>
Diffstat (limited to 'lib')
-rw-r--r--lib/automake.mk10
-rw-r--r--lib/dp-packet.c23
-rw-r--r--lib/dp-packet.h18
-rw-r--r--lib/dpif-netdev-perf.h24
-rw-r--r--lib/netdev-afxdp-pool.c167
-rw-r--r--lib/netdev-afxdp-pool.h56
-rw-r--r--lib/netdev-afxdp.c1041
-rw-r--r--lib/netdev-afxdp.h71
-rw-r--r--lib/netdev-linux-private.h130
-rw-r--r--lib/netdev-linux.c126
-rw-r--r--lib/netdev-provider.h3
-rw-r--r--lib/netdev.c11
-rw-r--r--lib/util.c91
-rw-r--r--lib/util.h5
14 files changed, 1667 insertions, 109 deletions
diff --git a/lib/automake.mk b/lib/automake.mk
index 29d3458da..17b36b43d 100644
--- a/lib/automake.mk
+++ b/lib/automake.mk
@@ -9,6 +9,7 @@ lib_LTLIBRARIES += lib/libopenvswitch.la
lib_libopenvswitch_la_LIBADD = $(SSL_LIBS)
lib_libopenvswitch_la_LIBADD += $(CAPNG_LDADD)
+lib_libopenvswitch_la_LIBADD += $(LIBBPF_LDADD)
if WIN32
lib_libopenvswitch_la_LIBADD += ${PTHREAD_LIBS}
@@ -396,6 +397,7 @@ lib_libopenvswitch_la_SOURCES += \
lib/if-notifier.h \
lib/netdev-linux.c \
lib/netdev-linux.h \
+ lib/netdev-linux-private.h \
lib/netdev-offload-tc.c \
lib/netlink-conntrack.c \
lib/netlink-conntrack.h \
@@ -412,6 +414,14 @@ lib_libopenvswitch_la_SOURCES += \
lib/tc.h
endif
+if HAVE_AF_XDP
+lib_libopenvswitch_la_SOURCES += \
+ lib/netdev-afxdp-pool.c \
+ lib/netdev-afxdp-pool.h \
+ lib/netdev-afxdp.c \
+ lib/netdev-afxdp.h
+endif
+
if DPDK_NETDEV
lib_libopenvswitch_la_SOURCES += \
lib/dpdk.c \
diff --git a/lib/dp-packet.c b/lib/dp-packet.c
index 0976a35e7..62d7faa4c 100644
--- a/lib/dp-packet.c
+++ b/lib/dp-packet.c
@@ -19,6 +19,7 @@
#include <string.h>
#include "dp-packet.h"
+#include "netdev-afxdp.h"
#include "netdev-dpdk.h"
#include "openvswitch/dynamic-string.h"
#include "util.h"
@@ -59,6 +60,22 @@ dp_packet_use(struct dp_packet *b, void *base, size_t allocated)
dp_packet_use__(b, base, allocated, DPBUF_MALLOC);
}
+#if HAVE_AF_XDP
+/* Initialize 'b' as an empty dp_packet that contains
+ * memory starting at AF_XDP umem base.
+ *
+ * 'data' points into an umem frame with 'headroom' reserved bytes before
+ * it, so the packet base becomes 'data' - 'headroom'.  The packet starts
+ * out with size 0; the caller sets the real length afterwards (see
+ * netdev_afxdp_rxq_recv()).  Source is marked DPBUF_AFXDP so freeing
+ * returns the frame to its umem pool instead of calling free(). */
+void
+dp_packet_use_afxdp(struct dp_packet *b, void *data, size_t allocated,
+ size_t headroom)
+{
+ dp_packet_set_base(b, (char *)data - headroom);
+ dp_packet_set_data(b, data);
+ dp_packet_set_size(b, 0);
+
+ dp_packet_init__(b, allocated, DPBUF_AFXDP);
+}
+#endif
+
/* Initializes 'b' as an empty dp_packet that contains the 'allocated' bytes of
* memory starting at 'base'. 'base' should point to a buffer on the stack.
* (Nothing actually relies on 'base' being allocated on the stack. It could
@@ -122,6 +139,8 @@ dp_packet_uninit(struct dp_packet *b)
* created as a dp_packet */
free_dpdk_buf((struct dp_packet*) b);
#endif
+ } else if (b->source == DPBUF_AFXDP) {
+ free_afxdp_buf(b);
}
}
}
@@ -248,6 +267,9 @@ dp_packet_resize__(struct dp_packet *b, size_t new_headroom, size_t new_tailroom
case DPBUF_STACK:
OVS_NOT_REACHED();
+ case DPBUF_AFXDP:
+ OVS_NOT_REACHED();
+
case DPBUF_STUB:
b->source = DPBUF_MALLOC;
new_base = xmalloc(new_allocated);
@@ -433,6 +455,7 @@ dp_packet_steal_data(struct dp_packet *b)
{
void *p;
ovs_assert(b->source != DPBUF_DPDK);
+ ovs_assert(b->source != DPBUF_AFXDP);
if (b->source == DPBUF_MALLOC && dp_packet_data(b) == dp_packet_base(b)) {
p = dp_packet_data(b);
diff --git a/lib/dp-packet.h b/lib/dp-packet.h
index a5e9ade12..14f0897fa 100644
--- a/lib/dp-packet.h
+++ b/lib/dp-packet.h
@@ -25,6 +25,7 @@
#include <rte_mbuf.h>
#endif
+#include "netdev-afxdp.h"
#include "netdev-dpdk.h"
#include "openvswitch/list.h"
#include "packets.h"
@@ -42,6 +43,7 @@ enum OVS_PACKED_ENUM dp_packet_source {
DPBUF_DPDK, /* buffer data is from DPDK allocated memory.
* ref to dp_packet_init_dpdk() in dp-packet.c.
*/
+ DPBUF_AFXDP, /* Buffer data from XDP frame. */
};
#define DP_PACKET_CONTEXT_SIZE 64
@@ -89,6 +91,13 @@ struct dp_packet {
};
};
+#if HAVE_AF_XDP
+/* AF_XDP packet metadata: ties each umem frame's dp_packet to the pool
+ * the frame must be returned to when the packet is freed.  These are
+ * preallocated in per-umem arrays (struct xpacket_pool), one per frame. */
+struct dp_packet_afxdp {
+ struct umem_pool *mpool;   /* Free-frame pool to push the buffer back to. */
+ struct dp_packet packet;   /* Embedded packet; 'source' is DPBUF_AFXDP. */
+};
+#endif
+
static inline void *dp_packet_data(const struct dp_packet *);
static inline void dp_packet_set_data(struct dp_packet *, void *);
static inline void *dp_packet_base(const struct dp_packet *);
@@ -122,7 +131,9 @@ static inline const void *dp_packet_get_nd_payload(const struct dp_packet *);
void dp_packet_use(struct dp_packet *, void *, size_t);
void dp_packet_use_stub(struct dp_packet *, void *, size_t);
void dp_packet_use_const(struct dp_packet *, const void *, size_t);
-
+#if HAVE_AF_XDP
+void dp_packet_use_afxdp(struct dp_packet *, void *, size_t, size_t);
+#endif
void dp_packet_init_dpdk(struct dp_packet *);
void dp_packet_init(struct dp_packet *, size_t);
@@ -184,6 +195,11 @@ dp_packet_delete(struct dp_packet *b)
return;
}
+ if (b->source == DPBUF_AFXDP) {
+ free_afxdp_buf(b);
+ return;
+ }
+
dp_packet_uninit(b);
free(b);
}
diff --git a/lib/dpif-netdev-perf.h b/lib/dpif-netdev-perf.h
index 859c05613..244813ffe 100644
--- a/lib/dpif-netdev-perf.h
+++ b/lib/dpif-netdev-perf.h
@@ -21,6 +21,7 @@
#include <stddef.h>
#include <stdint.h>
#include <string.h>
+#include <time.h>
#include <math.h>
#ifdef DPDK_NETDEV
@@ -186,6 +187,22 @@ struct pmd_perf_stats {
char *log_reason;
};
+#ifdef __linux__
+/* TSC substitute for Linux targets without an inline rdtsc: returns
+ * CLOCK_MONOTONIC_RAW in nanoseconds and caches it in s->last_tsc.
+ * On clock_gettime() failure the previously cached value is returned,
+ * so the "cycle" counter never moves backwards. */
+static inline uint64_t
+rdtsc_syscall(struct pmd_perf_stats *s)
+{
+ struct timespec val;
+ uint64_t v;
+
+ if (clock_gettime(CLOCK_MONOTONIC_RAW, &val) != 0) {
+ return s->last_tsc;
+ }
+
+ /* UINT64_C forces the multiply into 64-bit, avoiding tv_sec overflow. */
+ v = val.tv_sec * UINT64_C(1000000000) + val.tv_nsec;
+ return s->last_tsc = v;
+}
+#endif
+
/* Support for accurate timing of PMD execution on TSC clock cycle level.
* These functions are intended to be invoked in the context of pmd threads. */
@@ -198,6 +215,13 @@ cycles_counter_update(struct pmd_perf_stats *s)
{
#ifdef DPDK_NETDEV
return s->last_tsc = rte_get_tsc_cycles();
+#elif !defined(_MSC_VER) && defined(__x86_64__)
+ uint32_t h, l;
+ asm volatile("rdtsc" : "=a" (l), "=d" (h));
+
+ return s->last_tsc = ((uint64_t) h << 32) | l;
+#elif defined(__linux__)
+ return rdtsc_syscall(s);
#else
return s->last_tsc = 0;
#endif
diff --git a/lib/netdev-afxdp-pool.c b/lib/netdev-afxdp-pool.c
new file mode 100644
index 000000000..3386d2dcf
--- /dev/null
+++ b/lib/netdev-afxdp-pool.c
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2018, 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <config.h>
+
+#include "dp-packet.h"
+#include "netdev-afxdp-pool.h"
+#include "openvswitch/util.h"
+
+/* Note:
+ * umem_elem_push* shouldn't overflow because we always pop
+ * elem first, then push back to the stack.
+ */
+/* Pushes 'n' umem frame addresses from 'addrs' onto the LIFO stack.
+ * Caller must hold umemp->lock.  Overflow is asserted rather than handled
+ * (see note above: frames are always popped before being pushed back). */
+static inline void
+umem_elem_push_n__(struct umem_pool *umemp, int n, void **addrs)
+{
+ void *ptr;
+
+ ovs_assert(umemp->index + n <= umemp->size);
+
+ ptr = &umemp->array[umemp->index];
+ memcpy(ptr, addrs, n * sizeof(void *));
+ umemp->index += n;
+}
+
+/* Thread-safe wrapper: pushes 'n' addresses under umemp->lock. */
+void umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs)
+{
+ ovs_spin_lock(&umemp->lock);
+ umem_elem_push_n__(umemp, n, addrs);
+ ovs_spin_unlock(&umemp->lock);
+}
+
+/* Pushes a single frame address.  Caller must hold umemp->lock. */
+static inline void
+umem_elem_push__(struct umem_pool *umemp, void *addr)
+{
+ ovs_assert(umemp->index + 1 <= umemp->size);
+
+ umemp->array[umemp->index++] = addr;
+}
+
+/* Thread-safe wrapper: pushes one address under umemp->lock. */
+void
+umem_elem_push(struct umem_pool *umemp, void *addr)
+{
+ ovs_spin_lock(&umemp->lock);
+ umem_elem_push__(umemp, addr);
+ ovs_spin_unlock(&umemp->lock);
+}
+
+/* Pops 'n' frame addresses into 'addrs'.  Caller must hold umemp->lock.
+ * Returns 0 on success or -ENOMEM when fewer than 'n' frames remain
+ * (the stack is left untouched in that case). */
+static inline int
+umem_elem_pop_n__(struct umem_pool *umemp, int n, void **addrs)
+{
+ void *ptr;
+
+ if (OVS_UNLIKELY(umemp->index - n < 0)) {
+ return -ENOMEM;
+ }
+
+ umemp->index -= n;
+ ptr = &umemp->array[umemp->index];
+ memcpy(addrs, ptr, n * sizeof(void *));
+
+ return 0;
+}
+
+/* Thread-safe wrapper: pops 'n' addresses under umemp->lock.
+ * Returns 0 on success, -ENOMEM if the pool has fewer than 'n' frames. */
+int
+umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs)
+{
+ int ret;
+
+ ovs_spin_lock(&umemp->lock);
+ ret = umem_elem_pop_n__(umemp, n, addrs);
+ ovs_spin_unlock(&umemp->lock);
+
+ return ret;
+}
+
+/* Pops one frame address, or NULL if the pool is empty.
+ * Caller must hold umemp->lock. */
+static inline void *
+umem_elem_pop__(struct umem_pool *umemp)
+{
+ if (OVS_UNLIKELY(umemp->index - 1 < 0)) {
+ return NULL;
+ }
+
+ return umemp->array[--umemp->index];
+}
+
+/* Thread-safe wrapper: pops one address under umemp->lock.
+ * Returns NULL if the pool is empty. */
+void *
+umem_elem_pop(struct umem_pool *umemp)
+{
+ void *ptr;
+
+ ovs_spin_lock(&umemp->lock);
+ ptr = umem_elem_pop__(umemp);
+ ovs_spin_unlock(&umemp->lock);
+
+ return ptr;
+}
+
+/* Allocates a zeroed, page-aligned array of 'size' pointers.
+ * NOTE(review): OVS x*-style allocators normally abort on failure rather
+ * than return NULL — TODO confirm for xmalloc_pagealign(); if so, the
+ * NULL check in umem_pool_init() is purely defensive. */
+static void **
+umem_pool_alloc__(unsigned int size)
+{
+ void **bufs;
+
+ bufs = xmalloc_pagealign(size * sizeof *bufs);
+ memset(bufs, 0, size * sizeof *bufs);
+
+ return bufs;
+}
+
+/* Initializes 'umemp' as an empty LIFO stack with capacity 'size' and a
+ * fresh spinlock.  Returns 0 on success, -ENOMEM if allocation failed. */
+int
+umem_pool_init(struct umem_pool *umemp, unsigned int size)
+{
+ umemp->array = umem_pool_alloc__(size);
+ if (!umemp->array) {
+ return -ENOMEM;
+ }
+
+ umemp->size = size;
+ umemp->index = 0;
+ ovs_spin_init(&umemp->lock);
+ return 0;
+}
+
+/* Frees the pool's backing array and destroys its lock.  The frames the
+ * pool pointed into are owned by the umem buffer and freed separately. */
+void
+umem_pool_cleanup(struct umem_pool *umemp)
+{
+ free_pagealign(umemp->array);
+ umemp->array = NULL;
+ ovs_spin_destroy(&umemp->lock);
+}
+
+/* Returns the number of free frames currently in the pool.
+ * NOTE(review): reads 'index' without taking umemp->lock — callers
+ * presumably tolerate a stale snapshot; verify at call sites. */
+unsigned int
+umem_pool_count(struct umem_pool *umemp)
+{
+ return umemp->index;
+}
+
+/* AF_XDP metadata init/destroy. */
+/* Allocates a zeroed, page-aligned array of 'size' dp_packet_afxdp
+ * entries (one per umem frame).  Always returns 0: the x*-allocator is
+ * expected to abort on failure rather than return NULL. */
+int
+xpacket_pool_init(struct xpacket_pool *xp, unsigned int size)
+{
+ xp->array = xmalloc_pagealign(size * sizeof *xp->array);
+ xp->size = size;
+
+ memset(xp->array, 0, size * sizeof *xp->array);
+
+ return 0;
+}
+
+/* Frees the metadata array allocated by xpacket_pool_init(). */
+void
+xpacket_pool_cleanup(struct xpacket_pool *xp)
+{
+ free_pagealign(xp->array);
+ xp->array = NULL;
+}
diff --git a/lib/netdev-afxdp-pool.h b/lib/netdev-afxdp-pool.h
new file mode 100644
index 000000000..f929b9489
--- /dev/null
+++ b/lib/netdev-afxdp-pool.h
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2018, 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef XDPSOCK_H
+#define XDPSOCK_H 1
+
+#ifdef HAVE_AF_XDP
+
+#include <bpf/xsk.h>
+#include <errno.h>
+#include <stdbool.h>
+
+#include "openvswitch/thread.h"
+#include "ovs-atomic.h"
+
+/* LIFO ptr_array. */
+struct umem_pool {
+ int index; /* Point to top. */
+ unsigned int size;     /* Capacity of 'array' in elements. */
+ struct ovs_spin lock;  /* Protects 'index' and 'array' contents. */
+ void **array; /* A pointer array pointing to umem buf. */
+};
+
+/* Array-based dp_packet_afxdp. */
+struct xpacket_pool {
+ unsigned int size;               /* Number of entries in 'array'. */
+ struct dp_packet_afxdp *array;   /* One metadata entry per umem frame. */
+};
+
+void umem_elem_push(struct umem_pool *umemp, void *addr);
+void umem_elem_push_n(struct umem_pool *umemp, int n, void **addrs);
+
+void *umem_elem_pop(struct umem_pool *umemp);
+int umem_elem_pop_n(struct umem_pool *umemp, int n, void **addrs);
+
+int umem_pool_init(struct umem_pool *umemp, unsigned int size);
+void umem_pool_cleanup(struct umem_pool *umemp);
+unsigned int umem_pool_count(struct umem_pool *umemp);
+int xpacket_pool_init(struct xpacket_pool *xp, unsigned int size);
+void xpacket_pool_cleanup(struct xpacket_pool *xp);
+
+#endif
+#endif
diff --git a/lib/netdev-afxdp.c b/lib/netdev-afxdp.c
new file mode 100644
index 000000000..80f13fff3
--- /dev/null
+++ b/lib/netdev-afxdp.c
@@ -0,0 +1,1041 @@
+/*
+ * Copyright (c) 2018, 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <config.h>
+
+#include "netdev-linux-private.h"
+#include "netdev-linux.h"
+#include "netdev-afxdp.h"
+#include "netdev-afxdp-pool.h"
+
+#include <errno.h>
+#include <inttypes.h>
+#include <linux/rtnetlink.h>
+#include <linux/if_xdp.h>
+#include <net/if.h>
+#include <stdlib.h>
+#include <sys/resource.h>
+#include <sys/socket.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include "coverage.h"
+#include "dp-packet.h"
+#include "dpif-netdev.h"
+#include "fatal-signal.h"
+#include "openvswitch/compiler.h"
+#include "openvswitch/dynamic-string.h"
+#include "openvswitch/list.h"
+#include "openvswitch/vlog.h"
+#include "packets.h"
+#include "socket-util.h"
+#include "util.h"
+
+#ifndef SOL_XDP
+#define SOL_XDP 283
+#endif
+
+COVERAGE_DEFINE(afxdp_cq_empty);
+COVERAGE_DEFINE(afxdp_fq_full);
+COVERAGE_DEFINE(afxdp_tx_full);
+COVERAGE_DEFINE(afxdp_cq_skip);
+
+VLOG_DEFINE_THIS_MODULE(netdev_afxdp);
+
+static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
+
+#define MAX_XSKQ 16
+#define FRAME_HEADROOM XDP_PACKET_HEADROOM
+#define OVS_XDP_HEADROOM 128
+#define FRAME_SIZE XSK_UMEM__DEFAULT_FRAME_SIZE
+#define FRAME_SHIFT XSK_UMEM__DEFAULT_FRAME_SHIFT
+#define FRAME_SHIFT_MASK ((1 << FRAME_SHIFT) - 1)
+
+#define PROD_NUM_DESCS XSK_RING_PROD__DEFAULT_NUM_DESCS
+#define CONS_NUM_DESCS XSK_RING_CONS__DEFAULT_NUM_DESCS
+
+/* The worst case is all 4 queues TX/CQ/RX/FILL are full + some packets
+ * still on processing in threads. Number of packets currently in OVS
+ * processing is hard to estimate because it depends on number of ports.
+ * Setting NUM_FRAMES twice as large than total of ring sizes should be
+ * enough for most corner cases.
+ */
+#define NUM_FRAMES (4 * (PROD_NUM_DESCS + CONS_NUM_DESCS))
+#define BATCH_SIZE NETDEV_MAX_BURST
+
+BUILD_ASSERT_DECL(IS_POW2(NUM_FRAMES));
+BUILD_ASSERT_DECL(PROD_NUM_DESCS == CONS_NUM_DESCS);
+
+#define UMEM2DESC(elem, base) ((uint64_t)((char *)elem - (char *)base))
+
+static struct xsk_socket_info *xsk_configure(int ifindex, int xdp_queue_id,
+ int mode);
+static void xsk_remove_xdp_program(uint32_t ifindex, int xdpmode);
+static void xsk_destroy(struct xsk_socket_info *xsk);
+static int xsk_configure_all(struct netdev *netdev);
+static void xsk_destroy_all(struct netdev *netdev);
+
+/* A umem pool detached from a destroyed XSK socket whose frames may still
+ * be referenced by in-flight packets.  The sweeper frees it once all
+ * NUM_FRAMES frames are accounted for again. */
+struct unused_pool {
+ struct xsk_umem_info *umem_info;   /* Pool ownership transferred here. */
+ int lost_in_rings; /* Number of packets left in tx, rx, cq and fq. */
+ struct ovs_list list_node;         /* In 'unused_pools'. */
+};
+
+static struct ovs_mutex unused_pools_mutex = OVS_MUTEX_INITIALIZER;
+static struct ovs_list unused_pools OVS_GUARDED_BY(unused_pools_mutex) =
+ OVS_LIST_INITIALIZER(&unused_pools);
+
+/* Per-umem state: the shared packet buffer plus its kernel rings and the
+ * userspace free-frame / metadata pools built on top of it. */
+struct xsk_umem_info {
+ struct umem_pool mpool;    /* LIFO stack of free umem frames. */
+ struct xpacket_pool xpool; /* dp_packet_afxdp metadata, one per frame. */
+ struct xsk_ring_prod fq;   /* FILL ring: frames handed to kernel RX. */
+ struct xsk_ring_cons cq;   /* COMPLETION ring: reclaimed TX frames. */
+ struct xsk_umem *umem;     /* libbpf umem handle. */
+ void *buffer;              /* Base of the page-aligned umem region. */
+};
+
+/* Per-queue AF_XDP socket state: the RX/TX rings, the umem they draw
+ * frames from, and bookkeeping for frames currently owned by the rings. */
+struct xsk_socket_info {
+ struct xsk_ring_cons rx;       /* RX descriptor ring. */
+ struct xsk_ring_prod tx;       /* TX descriptor ring. */
+ struct xsk_umem_info *umem;    /* Backing umem (shared pools/rings). */
+ struct xsk_socket *xsk;        /* libbpf socket handle. */
+ uint32_t outstanding_tx; /* Number of descriptors filled in tx and cq. */
+ uint32_t available_rx; /* Number of descriptors filled in rx and fq. */
+ atomic_uint64_t tx_dropped;    /* Packets dropped on the send path. */
+};
+
+/* Releases everything owned by 'pool': the umem packet buffer, the
+ * free-frame stack and the per-frame metadata, then the umem_info itself.
+ * Caller must have verified that no frames are still in flight. */
+static void
+netdev_afxdp_cleanup_unused_pool(struct unused_pool *pool)
+{
+ /* Free the packet buffer. */
+ free_pagealign(pool->umem_info->buffer);
+
+ /* Cleanup umem pool. */
+ umem_pool_cleanup(&pool->umem_info->mpool);
+
+ /* Cleanup metadata pool. */
+ xpacket_pool_cleanup(&pool->umem_info->xpool);
+
+ free(pool->umem_info);
+}
+
+/* Walks 'unused_pools' and frees every pool whose frames have all come
+ * back (frames in the free stack + frames lost in rings == NUM_FRAMES).
+ * 'aux' is unused; the signature matches a generic callback and the
+ * function is also called directly with NULL. */
+static void
+netdev_afxdp_sweep_unused_pools(void *aux OVS_UNUSED)
+{
+ struct unused_pool *pool, *next;
+ unsigned int count;
+
+ ovs_mutex_lock(&unused_pools_mutex);
+ LIST_FOR_EACH_SAFE (pool, next, list_node, &unused_pools) {
+
+ count = umem_pool_count(&pool->umem_info->mpool);
+ ovs_assert(count + pool->lost_in_rings <= NUM_FRAMES);
+
+ if (count + pool->lost_in_rings == NUM_FRAMES) {
+ /* OVS doesn't use this memory pool anymore. Kernel doesn't
+ * use it since closing the xdp socket. So, it's safe to free
+ * the pool now. */
+ VLOG_DBG("Freeing umem pool at 0x%"PRIxPTR,
+ (uintptr_t) pool->umem_info);
+ ovs_list_remove(&pool->list_node);
+ netdev_afxdp_cleanup_unused_pool(pool);
+ free(pool);
+ }
+ }
+ ovs_mutex_unlock(&unused_pools_mutex);
+}
+
+/* Creates a umem over 'buffer' (of 'size' bytes): registers it with the
+ * kernel via xsk_umem__create(), then builds the userspace free-frame
+ * stack and per-frame dp_packet metadata.  Returns NULL on failure; the
+ * caller keeps ownership of 'buffer' in that case.
+ *
+ * NOTE(review): 'uconfig' is stack-allocated and only four fields are
+ * set; if the installed libbpf's xsk_umem_config has more fields (e.g.
+ * 'flags') they are left uninitialized — consider zero-initializing.
+ * NOTE(review): xsk_umem__create() returns a negative error code; the
+ * log below reports ovs_strerror(errno), which may not match — TODO
+ * confirm and consider using -ret. */
+static struct xsk_umem_info *
+xsk_configure_umem(void *buffer, uint64_t size, int xdpmode)
+{
+ struct xsk_umem_config uconfig;
+ struct xsk_umem_info *umem;
+ int ret;
+ int i;
+
+ umem = xzalloc(sizeof *umem);
+
+ uconfig.fill_size = PROD_NUM_DESCS;
+ uconfig.comp_size = CONS_NUM_DESCS;
+ uconfig.frame_size = FRAME_SIZE;
+ uconfig.frame_headroom = OVS_XDP_HEADROOM;
+
+ ret = xsk_umem__create(&umem->umem, buffer, size, &umem->fq, &umem->cq,
+ &uconfig);
+ if (ret) {
+ VLOG_ERR("xsk_umem__create failed (%s) mode: %s",
+ ovs_strerror(errno),
+ xdpmode == XDP_COPY ? "SKB": "DRV");
+ free(umem);
+ return NULL;
+ }
+
+ umem->buffer = buffer;
+
+ /* Set-up umem pool. */
+ if (umem_pool_init(&umem->mpool, NUM_FRAMES) < 0) {
+ VLOG_ERR("umem_pool_init failed");
+ if (xsk_umem__delete(umem->umem)) {
+ VLOG_ERR("xsk_umem__delete failed");
+ }
+ free(umem);
+ return NULL;
+ }
+
+ /* Seed the free-frame stack with every frame, highest address first so
+ * that frames pop in ascending order. */
+ for (i = NUM_FRAMES - 1; i >= 0; i--) {
+ void *elem;
+
+ elem = ALIGNED_CAST(void *, (char *)umem->buffer + i * FRAME_SIZE);
+ umem_elem_push(&umem->mpool, elem);
+ }
+
+ /* Set-up metadata. */
+ if (xpacket_pool_init(&umem->xpool, NUM_FRAMES) < 0) {
+ VLOG_ERR("xpacket_pool_init failed");
+ umem_pool_cleanup(&umem->mpool);
+ if (xsk_umem__delete(umem->umem)) {
+ VLOG_ERR("xsk_umem__delete failed");
+ }
+ free(umem);
+ return NULL;
+ }
+
+ VLOG_DBG("%s: xpacket pool from %p to %p", __func__,
+ umem->xpool.array,
+ (char *)umem->xpool.array +
+ NUM_FRAMES * sizeof(struct dp_packet_afxdp));
+
+ /* Bind each metadata entry to its pool and mark its embedded packet as
+ * AF_XDP-sourced once, up front; recv only needs to set base/size. */
+ for (i = NUM_FRAMES - 1; i >= 0; i--) {
+ struct dp_packet_afxdp *xpacket;
+ struct dp_packet *packet;
+
+ xpacket = &umem->xpool.array[i];
+ xpacket->mpool = &umem->mpool;
+
+ packet = &xpacket->packet;
+ packet->source = DPBUF_AFXDP;
+ }
+
+ return umem;
+}
+
+/* Creates the AF_XDP socket for ('ifindex', 'queue_id') on top of 'umem',
+ * verifies an XDP program is attached, and pre-populates the FILL ring
+ * with PROD_NUM_DESCS frames so the kernel can receive immediately.
+ * Returns NULL on failure (the umem itself is left for the caller to
+ * clean up).
+ *
+ * NOTE(review): 'ifindex' is uint32_t but logged with "%d"; prefer
+ * "%"PRIu32.  The reserve loop below busy-waits without bound — with a
+ * freshly created socket it should succeed quickly, but verify. */
+static struct xsk_socket_info *
+xsk_configure_socket(struct xsk_umem_info *umem, uint32_t ifindex,
+ uint32_t queue_id, int xdpmode)
+{
+ struct xsk_socket_config cfg;
+ struct xsk_socket_info *xsk;
+ char devname[IF_NAMESIZE];
+ uint32_t idx = 0, prog_id;
+ int ret;
+ int i;
+
+ xsk = xzalloc(sizeof *xsk);
+ xsk->umem = umem;
+ cfg.rx_size = CONS_NUM_DESCS;
+ cfg.tx_size = PROD_NUM_DESCS;
+ cfg.libbpf_flags = 0;
+
+ if (xdpmode == XDP_ZEROCOPY) {
+ cfg.bind_flags = XDP_ZEROCOPY;
+ cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_DRV_MODE;
+ } else {
+ cfg.bind_flags = XDP_COPY;
+ cfg.xdp_flags = XDP_FLAGS_UPDATE_IF_NOEXIST | XDP_FLAGS_SKB_MODE;
+ }
+
+ if (if_indextoname(ifindex, devname) == NULL) {
+ VLOG_ERR("ifindex %d to devname failed (%s)",
+ ifindex, ovs_strerror(errno));
+ free(xsk);
+ return NULL;
+ }
+
+ ret = xsk_socket__create(&xsk->xsk, devname, queue_id, umem->umem,
+ &xsk->rx, &xsk->tx, &cfg);
+ if (ret) {
+ VLOG_ERR("xsk_socket__create failed (%s) mode: %s qid: %d",
+ ovs_strerror(errno),
+ xdpmode == XDP_COPY ? "SKB": "DRV",
+ queue_id);
+ free(xsk);
+ return NULL;
+ }
+
+ /* Make sure the built-in AF_XDP program is loaded. */
+ ret = bpf_get_link_xdp_id(ifindex, &prog_id, cfg.xdp_flags);
+ if (ret) {
+ VLOG_ERR("Get XDP prog ID failed (%s)", ovs_strerror(errno));
+ xsk_socket__delete(xsk->xsk);
+ free(xsk);
+ return NULL;
+ }
+
+ while (!xsk_ring_prod__reserve(&xsk->umem->fq,
+ PROD_NUM_DESCS, &idx)) {
+ VLOG_WARN_RL(&rl, "Retry xsk_ring_prod__reserve to FILL queue");
+ }
+
+ /* Pop one free frame per FILL slot and hand its umem offset to the
+ * kernel so RX can start filling descriptors. */
+ for (i = 0;
+ i < PROD_NUM_DESCS * FRAME_SIZE;
+ i += FRAME_SIZE) {
+ void *elem;
+ uint64_t addr;
+
+ elem = umem_elem_pop(&xsk->umem->mpool);
+ addr = UMEM2DESC(elem, xsk->umem->buffer);
+
+ *xsk_ring_prod__fill_addr(&xsk->umem->fq, idx++) = addr;
+ }
+
+ xsk_ring_prod__submit(&xsk->umem->fq,
+ PROD_NUM_DESCS);
+ return xsk;
+}
+
+/* Full per-queue setup: sweeps any retired pools, allocates and zeroes
+ * the umem region, creates the umem, then the AF_XDP socket for
+ * 'xdp_queue_id'.  Returns the socket info or NULL on failure, in which
+ * case all intermediate allocations are released here. */
+static struct xsk_socket_info *
+xsk_configure(int ifindex, int xdp_queue_id, int xdpmode)
+{
+ struct xsk_socket_info *xsk;
+ struct xsk_umem_info *umem;
+ void *bufs;
+
+ netdev_afxdp_sweep_unused_pools(NULL);
+
+ /* Umem memory region. */
+ bufs = xmalloc_pagealign(NUM_FRAMES * FRAME_SIZE);
+ memset(bufs, 0, NUM_FRAMES * FRAME_SIZE);
+
+ /* Create AF_XDP socket. */
+ umem = xsk_configure_umem(bufs,
+ NUM_FRAMES * FRAME_SIZE,
+ xdpmode);
+ if (!umem) {
+ free_pagealign(bufs);
+ return NULL;
+ }
+
+ VLOG_DBG("Allocated umem pool at 0x%"PRIxPTR, (uintptr_t) umem);
+
+ xsk = xsk_configure_socket(umem, ifindex, xdp_queue_id, xdpmode);
+ if (!xsk) {
+ /* Clean up umem and xpacket pool. */
+ if (xsk_umem__delete(umem->umem)) {
+ VLOG_ERR("xsk_umem__delete failed.");
+ }
+ free_pagealign(bufs);
+ umem_pool_cleanup(&umem->mpool);
+ xpacket_pool_cleanup(&umem->xpool);
+ free(umem);
+ }
+ return xsk;
+}
+
+/* Creates one AF_XDP socket per RX queue and one TX spinlock per TX
+ * queue.  On any per-queue failure, tears down everything already built
+ * via xsk_destroy_all() and returns EINVAL; returns 0 on success.
+ * Requires that no previous configuration is in place (asserted). */
+static int
+xsk_configure_all(struct netdev *netdev)
+{
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+ struct xsk_socket_info *xsk_info;
+ int i, ifindex, n_rxq, n_txq;
+
+ ifindex = linux_get_ifindex(netdev_get_name(netdev));
+
+ ovs_assert(dev->xsks == NULL);
+ ovs_assert(dev->tx_locks == NULL);
+
+ n_rxq = netdev_n_rxq(netdev);
+ dev->xsks = xcalloc(n_rxq, sizeof *dev->xsks);
+
+ /* Configure each queue. */
+ for (i = 0; i < n_rxq; i++) {
+ VLOG_INFO("%s: configure queue %d mode %s", __func__, i,
+ dev->xdpmode == XDP_COPY ? "SKB" : "DRV");
+ xsk_info = xsk_configure(ifindex, i, dev->xdpmode);
+ if (!xsk_info) {
+ VLOG_ERR("Failed to create AF_XDP socket on queue %d.", i);
+ dev->xsks[i] = NULL;
+ goto err;
+ }
+ dev->xsks[i] = xsk_info;
+ atomic_init(&xsk_info->tx_dropped, 0);
+ xsk_info->outstanding_tx = 0;
+ /* All PROD_NUM_DESCS frames were just handed to the FILL ring. */
+ xsk_info->available_rx = PROD_NUM_DESCS;
+ }
+
+ n_txq = netdev_n_txq(netdev);
+ dev->tx_locks = xcalloc(n_txq, sizeof *dev->tx_locks);
+
+ for (i = 0; i < n_txq; i++) {
+ ovs_spin_init(&dev->tx_locks[i]);
+ }
+
+ return 0;
+
+err:
+ xsk_destroy_all(netdev);
+ return EINVAL;
+}
+
+/* Destroys one AF_XDP socket and its umem handle.  The umem's memory
+ * cannot be freed yet — packets may still be in flight inside OVS — so
+ * its pools are parked on 'unused_pools' with a count of frames still
+ * owned by the rings, and the sweeper frees them once all frames return. */
+static void
+xsk_destroy(struct xsk_socket_info *xsk_info)
+{
+ struct xsk_umem *umem;
+ struct unused_pool *pool;
+
+ xsk_socket__delete(xsk_info->xsk);
+ xsk_info->xsk = NULL;
+
+ umem = xsk_info->umem->umem;
+ if (xsk_umem__delete(umem)) {
+ VLOG_ERR("xsk_umem__delete failed.");
+ }
+
+ pool = xzalloc(sizeof *pool);
+ pool->umem_info = xsk_info->umem;
+ pool->lost_in_rings = xsk_info->outstanding_tx + xsk_info->available_rx;
+
+ ovs_mutex_lock(&unused_pools_mutex);
+ ovs_list_push_back(&unused_pools, &pool->list_node);
+ ovs_mutex_unlock(&unused_pools_mutex);
+
+ free(xsk_info);
+
+ /* Opportunistically free any pool whose frames have all returned. */
+ netdev_afxdp_sweep_unused_pools(NULL);
+}
+
+/* Tears down all per-queue sockets and TX locks for 'netdev' and detaches
+ * the XDP program from the interface.  Safe to call when partially
+ * configured (e.g. from the xsk_configure_all() error path). */
+static void
+xsk_destroy_all(struct netdev *netdev)
+{
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+ int i, ifindex;
+
+ if (dev->xsks) {
+ for (i = 0; i < netdev_n_rxq(netdev); i++) {
+ if (dev->xsks[i]) {
+ xsk_destroy(dev->xsks[i]);
+ dev->xsks[i] = NULL;
+ VLOG_INFO("Destroyed xsk[%d].", i);
+ }
+ }
+
+ free(dev->xsks);
+ dev->xsks = NULL;
+ }
+
+ VLOG_INFO("%s: Removing xdp program.", netdev_get_name(netdev));
+ ifindex = linux_get_ifindex(netdev_get_name(netdev));
+ xsk_remove_xdp_program(ifindex, dev->xdpmode);
+
+ if (dev->tx_locks) {
+ for (i = 0; i < netdev_n_txq(netdev); i++) {
+ ovs_spin_destroy(&dev->tx_locks[i]);
+ }
+ free(dev->tx_locks);
+ dev->tx_locks = NULL;
+ }
+}
+
+/* Debug helper: queries the socket's XDP_STATISTICS and logs them.
+ * Only called when AFXDP_DEBUG is defined.
+ * NOTE(review): the getsockopt() call lives inside ovs_assert(); if
+ * asserts were ever compiled out, the side effect (and 'stat') would
+ * vanish — safer to call it separately and assert on the result. */
+static inline void OVS_UNUSED
+log_xsk_stat(struct xsk_socket_info *xsk OVS_UNUSED) {
+ struct xdp_statistics stat;
+ socklen_t optlen;
+
+ optlen = sizeof stat;
+ ovs_assert(getsockopt(xsk_socket__fd(xsk->xsk), SOL_XDP, XDP_STATISTICS,
+ &stat, &optlen) == 0);
+
+ VLOG_DBG_RL(&rl, "rx dropped %llu, rx_invalid %llu, tx_invalid %llu",
+ stat.rx_dropped,
+ stat.rx_invalid_descs,
+ stat.tx_invalid_descs);
+}
+
+/* netdev-provider set_config: parses 'n_rxq' (clamped to [1, MAX_XSKQ])
+ * and 'xdpmode' ("skb" -> XDP_COPY, "drv" -> XDP_ZEROCOPY) from 'args'
+ * and requests a reconfigure only if something changed.  Returns 0 or
+ * EINVAL on invalid input.
+ * NOTE(review): the two error paths unlock/log in opposite orders;
+ * harmless, but worth making consistent. */
+int
+netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
+ char **errp OVS_UNUSED)
+{
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+ const char *str_xdpmode;
+ int xdpmode, new_n_rxq;
+
+ ovs_mutex_lock(&dev->mutex);
+ new_n_rxq = MAX(smap_get_int(args, "n_rxq", NR_QUEUE), 1);
+ if (new_n_rxq > MAX_XSKQ) {
+ ovs_mutex_unlock(&dev->mutex);
+ VLOG_ERR("%s: Too big 'n_rxq' (%d > %d).",
+ netdev_get_name(netdev), new_n_rxq, MAX_XSKQ);
+ return EINVAL;
+ }
+
+ str_xdpmode = smap_get_def(args, "xdpmode", "skb");
+ if (!strcasecmp(str_xdpmode, "drv")) {
+ xdpmode = XDP_ZEROCOPY;
+ } else if (!strcasecmp(str_xdpmode, "skb")) {
+ xdpmode = XDP_COPY;
+ } else {
+ VLOG_ERR("%s: Incorrect xdpmode (%s).",
+ netdev_get_name(netdev), str_xdpmode);
+ ovs_mutex_unlock(&dev->mutex);
+ return EINVAL;
+ }
+
+ if (dev->requested_n_rxq != new_n_rxq
+ || dev->requested_xdpmode != xdpmode) {
+ dev->requested_n_rxq = new_n_rxq;
+ dev->requested_xdpmode = xdpmode;
+ netdev_request_reconfigure(netdev);
+ }
+ ovs_mutex_unlock(&dev->mutex);
+ return 0;
+}
+
+/* netdev-provider get_config: reports the currently applied 'n_rxq' and
+ * 'xdpmode' (not the requested ones) into 'args'.  Always returns 0. */
+int
+netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args)
+{
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+
+ ovs_mutex_lock(&dev->mutex);
+ smap_add_format(args, "n_rxq", "%d", netdev->n_rxq);
+ smap_add_format(args, "xdpmode", "%s",
+ dev->xdpmode == XDP_ZEROCOPY ? "drv" : "skb");
+ ovs_mutex_unlock(&dev->mutex);
+ return 0;
+}
+
+/* netdev-provider reconfigure: applies requested n_rxq/xdpmode.  No-op if
+ * nothing changed; otherwise destroys all sockets, updates the applied
+ * settings (raising RLIMIT_MEMLOCK for zero-copy DRV mode, which needs to
+ * lock umem pages), and rebuilds them.  Returns 0 or the error from
+ * xsk_configure_all(). */
+int
+netdev_afxdp_reconfigure(struct netdev *netdev)
+{
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+ struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+ int err = 0;
+
+ ovs_mutex_lock(&dev->mutex);
+
+ if (netdev->n_rxq == dev->requested_n_rxq
+ && dev->xdpmode == dev->requested_xdpmode) {
+ goto out;
+ }
+
+ xsk_destroy_all(netdev);
+
+ netdev->n_rxq = dev->requested_n_rxq;
+ netdev->n_txq = netdev->n_rxq;
+
+ if (dev->requested_xdpmode == XDP_ZEROCOPY) {
+ dev->xdpmode = XDP_ZEROCOPY;
+ VLOG_INFO("AF_XDP device %s in DRV mode.", netdev_get_name(netdev));
+ /* setrlimit failure is logged but not fatal: socket creation will
+ * fail later if locked memory is actually insufficient. */
+ if (setrlimit(RLIMIT_MEMLOCK, &r)) {
+ VLOG_ERR("ERROR: setrlimit(RLIMIT_MEMLOCK): %s",
+ ovs_strerror(errno));
+ }
+ } else {
+ dev->xdpmode = XDP_COPY;
+ VLOG_INFO("AF_XDP device %s in SKB mode.", netdev_get_name(netdev));
+ /* TODO: set rlimit back to previous value
+ * when no device is in DRV mode.
+ */
+ }
+
+ err = xsk_configure_all(netdev);
+ if (err) {
+ VLOG_ERR("AF_XDP device %s reconfig failed.", netdev_get_name(netdev));
+ }
+ netdev_change_seq_changed(netdev);
+out:
+ ovs_mutex_unlock(&dev->mutex);
+ return err;
+}
+
+/* netdev-provider get_numa_id: placeholder that always reports NUMA
+ * node 0 (and logs that fact) until real PCIe-based lookup is added. */
+int
+netdev_afxdp_get_numa_id(const struct netdev *netdev)
+{
+ /* FIXME: Get netdev's PCIe device ID, then find
+ * its NUMA node id.
+ */
+ VLOG_INFO("FIXME: Device %s always use numa id 0.",
+ netdev_get_name(netdev));
+ return 0;
+}
+
+/* Detaches the XDP program from 'ifindex' by setting fd -1, using flags
+ * matching the mode it was attached with (SKB vs DRV).
+ * NOTE(review): bpf_set_link_xdp_fd()'s return value is ignored —
+ * failure to detach goes unreported. */
+static void
+xsk_remove_xdp_program(uint32_t ifindex, int xdpmode)
+{
+ uint32_t flags;
+
+ flags = XDP_FLAGS_UPDATE_IF_NOEXIST;
+
+ if (xdpmode == XDP_COPY) {
+ flags |= XDP_FLAGS_SKB_MODE;
+ } else if (xdpmode == XDP_ZEROCOPY) {
+ flags |= XDP_FLAGS_DRV_MODE;
+ }
+
+ bpf_set_link_xdp_fd(ifindex, -1, flags);
+}
+
+/* Forcefully detaches the XDP program from 'netdev', bypassing normal
+ * socket teardown — intended for abnormal-exit paths where sockets
+ * cannot be cleanly destroyed. */
+void
+signal_remove_xdp(struct netdev *netdev)
+{
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+ int ifindex;
+
+ ifindex = linux_get_ifindex(netdev_get_name(netdev));
+
+ VLOG_WARN("Force removing xdp program.");
+ xsk_remove_xdp_program(ifindex, dev->xdpmode);
+}
+
+/* Recovers the enclosing dp_packet_afxdp metadata from its embedded
+ * dp_packet 'd'.  Asserts that 'd' really is an AF_XDP packet. */
+static struct dp_packet_afxdp *
+dp_packet_cast_afxdp(const struct dp_packet *d)
+{
+ ovs_assert(d->source == DPBUF_AFXDP);
+ return CONTAINER_OF(d, struct dp_packet_afxdp, packet);
+}
+
+/* Tops up the FILL ring with one BATCH_SIZE worth of free frames so the
+ * kernel always has RX buffers.  Best-effort: silently returns if the
+ * ring lacks space or the free-frame pool is short; returns frames to the
+ * pool if ring reservation fails after the pop. */
+static inline void
+prepare_fill_queue(struct xsk_socket_info *xsk_info)
+{
+ struct xsk_umem_info *umem;
+ void *elems[BATCH_SIZE];
+ unsigned int idx_fq;
+ int i, ret;
+
+ umem = xsk_info->umem;
+
+ if (xsk_prod_nb_free(&umem->fq, BATCH_SIZE) < BATCH_SIZE) {
+ return;
+ }
+
+ ret = umem_elem_pop_n(&umem->mpool, BATCH_SIZE, elems);
+ if (OVS_UNLIKELY(ret)) {
+ return;
+ }
+
+ if (!xsk_ring_prod__reserve(&umem->fq, BATCH_SIZE, &idx_fq)) {
+ /* Reservation failed: undo the pop so no frames leak. */
+ umem_elem_push_n(&umem->mpool, BATCH_SIZE, elems);
+ COVERAGE_INC(afxdp_fq_full);
+ return;
+ }
+
+ for (i = 0; i < BATCH_SIZE; i++) {
+ uint64_t index;
+ void *elem;
+
+ elem = elems[i];
+ /* FILL descriptors are byte offsets into the umem buffer and must
+ * be frame-aligned. */
+ index = (uint64_t)((char *)elem - (char *)umem->buffer);
+ ovs_assert((index & FRAME_SHIFT_MASK) == 0);
+ *xsk_ring_prod__fill_addr(&umem->fq, idx_fq) = index;
+
+ idx_fq++;
+ }
+ xsk_ring_prod__submit(&umem->fq, BATCH_SIZE);
+ xsk_info->available_rx += BATCH_SIZE;
+}
+
+/* netdev-provider rxq_recv: replenishes the FILL ring, then converts up
+ * to BATCH_SIZE RX descriptors into zero-copy dp_packets (each frame's
+ * preallocated dp_packet_afxdp metadata is looked up by frame index) and
+ * appends them to 'batch'.  Returns 0 on success or EAGAIN if the socket
+ * is not ready or no packets are available.  'qfill' is currently always
+ * reported as 0. */
+int
+netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_, struct dp_packet_batch *batch,
+ int *qfill)
+{
+ struct netdev_rxq_linux *rx = netdev_rxq_linux_cast(rxq_);
+ struct netdev *netdev = rx->up.netdev;
+ struct netdev_linux *dev = netdev_linux_cast(netdev);
+ struct xsk_socket_info *xsk_info;
+ struct xsk_umem_info *umem;
+ uint32_t idx_rx = 0;
+ int qid = rxq_->queue_id;
+ unsigned int rcvd, i;
+
+ xsk_info = dev->xsks[qid];
+ if (!xsk_info || !xsk_info->xsk) {
+ return EAGAIN;
+ }
+
+ prepare_fill_queue(xsk_info);
+
+ umem = xsk_info->umem;
+ /* Expose the socket fd so the poll loop can wait on it. */
+ rx->fd = xsk_socket__fd(xsk_info->xsk);
+
+ rcvd = xsk_ring_cons__peek(&xsk_info->rx, BATCH_SIZE, &idx_rx);
+ if (!rcvd) {
+ return EAGAIN;
+ }
+
+ /* Setup a dp_packet batch from descriptors in RX queue. */
+ for (i = 0; i < rcvd; i++) {
+ struct dp_packet_afxdp *xpacket;
+ const struct xdp_desc *desc;
+ struct dp_packet *packet;
+ uint64_t addr, index;
+ uint32_t len;
+ char *pkt;
+
+ desc = xsk_ring_cons__rx_desc(&xsk_info->rx, idx_rx);
+ addr = desc->addr;
+ len = desc->len;
+
+ pkt = xsk_umem__get_data(umem->buffer, addr);
+ /* Frame index = umem offset / FRAME_SIZE; selects the frame's
+ * preallocated metadata entry. */
+ index = addr >> FRAME_SHIFT;
+ xpacket = &umem->xpool.array[index];
+ packet = &xpacket->packet;
+
+ /* Initialize the struct dp_packet. */
+ dp_packet_use_afxdp(packet, pkt,
+ FRAME_SIZE - FRAME_HEADROOM,
+ OVS_XDP_HEADROOM);
+ dp_packet_set_size(packet, len);
+
+ /* Add packet into batch, increase batch->count. */
+ dp_packet_batch_add(batch, packet);
+
+ idx_rx++;
+ }
+ /* Release the RX queue. */
+ xsk_ring_cons__release(&xsk_info->rx, rcvd);
+ xsk_info->available_rx -= rcvd;
+
+ if (qfill) {
+ /* TODO: return the number of remaining packets in the queue. */
+ *qfill = 0;
+ }
+
+#ifdef AFXDP_DEBUG
+ log_xsk_stat(xsk_info);
+#endif
+ return 0;
+}
+
+/* Kicks the kernel to transmit queued TX descriptors via a zero-byte
+ * sendto().  Returns 0 on success / transient conditions (EBUSY, retries
+ * exhausted on EAGAIN), or the errno for the hard failures ENXIO,
+ * ENOBUFS and EOPNOTSUPP. */
+static inline int
+kick_tx(struct xsk_socket_info *xsk_info, int xdpmode)
+{
+ int ret, retries;
+ static const int KERNEL_TX_BATCH_SIZE = 16;
+
+ /* In SKB_MODE packet transmission is synchronous, and the kernel xmits
+ * only TX_BATCH_SIZE(16) packets for a single sendmsg syscall.
+ * So, we have to kick the kernel (n_packets / 16) times to be sure that
+ * all packets are transmitted. */
+ retries = (xdpmode == XDP_COPY)
+ ? xsk_info->outstanding_tx / KERNEL_TX_BATCH_SIZE
+ : 0;
+kick_retry:
+ /* This causes system call into kernel's xsk_sendmsg, and
+ * xsk_generic_xmit (skb mode) or xsk_async_xmit (driver mode).
+ */
+ ret = sendto(xsk_socket__fd(xsk_info->xsk), NULL, 0, MSG_DONTWAIT,
+ NULL, 0);
+ if (ret < 0) {
+ if (retries-- && errno == EAGAIN) {
+ goto kick_retry;
+ }
+ if (errno == ENXIO || errno == ENOBUFS || errno == EOPNOTSUPP) {
+ return errno;
+ }
+ }
+ /* No error, or EBUSY, or too many retries on EAGAIN. */
+ return 0;
+}
+
+/* Returns the AF_XDP umem frame backing dp_packet 'p' to the umem memory
+ * pool it came from.  Used as the free handler for DPBUF_AFXDP packets.
+ * Does nothing if the packet has no associated mpool. */
+void
+free_afxdp_buf(struct dp_packet *p)
+{
+    struct dp_packet_afxdp *xpacket;
+    uintptr_t addr;
+
+    xpacket = dp_packet_cast_afxdp(p);
+    if (xpacket->mpool) {
+        void *base = dp_packet_base(p);
+
+        /* Mask off the in-frame offset to recover the frame start address. */
+        addr = (uintptr_t)base & (~FRAME_SHIFT_MASK);
+        umem_elem_push(xpacket->mpool, (void *)addr);
+    }
+}
+
+/* Returns the umem frames of all packets in 'batch' to their memory pool
+ * with a single bulk push, then re-initializes the batch.  The caller must
+ * have verified (see check_free_batch()) that every packet is DPBUF_AFXDP
+ * and that all of them belong to the same mpool. */
+static void
+free_afxdp_buf_batch(struct dp_packet_batch *batch)
+{
+    struct dp_packet_afxdp *xpacket = NULL;
+    struct dp_packet *packet;
+    void *elems[BATCH_SIZE];
+    uintptr_t addr;
+
+    /* Collect the frame start address of every packet in the batch. */
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        void *base;
+
+        xpacket = dp_packet_cast_afxdp(packet);
+        base = dp_packet_base(packet);
+        addr = (uintptr_t)base & (~FRAME_SHIFT_MASK);
+        elems[i] = (void *)addr;
+    }
+    /* All packets share one pool, so the last 'xpacket' identifies it. */
+    umem_elem_push_n(xpacket->mpool, batch->count, elems);
+    dp_packet_batch_init(batch);
+}
+
+/* Returns true if every packet in 'batch' is a DPBUF_AFXDP packet from the
+ * same umem pool, i.e. the whole batch may be freed at once with
+ * free_afxdp_buf_batch().  Returns false otherwise. */
+static inline bool
+check_free_batch(struct dp_packet_batch *batch)
+{
+    struct umem_pool *first_mpool = NULL;
+    struct dp_packet_afxdp *xpacket;
+    struct dp_packet *packet;
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        if (packet->source != DPBUF_AFXDP) {
+            return false;
+        }
+        xpacket = dp_packet_cast_afxdp(packet);
+        if (i == 0) {
+            /* Remember the first packet's pool to compare the rest against. */
+            first_mpool = xpacket->mpool;
+            continue;
+        }
+        if (xpacket->mpool != first_mpool) {
+            return false;
+        }
+    }
+    /* All packets are DPBUF_AFXDP and from the same mpool. */
+    return true;
+}
+
+/* Reclaims completed TX descriptors from the completion queue (CQ) of
+ * 'xsk_info's umem and recycles their frames back into the umem memory
+ * pool, decrementing 'outstanding_tx' accordingly. */
+static inline void
+afxdp_complete_tx(struct xsk_socket_info *xsk_info)
+{
+    void *elems_push[BATCH_SIZE];
+    struct xsk_umem_info *umem;
+    uint32_t idx_cq = 0;
+    int tx_to_free = 0;
+    int tx_done, j;
+
+    umem = xsk_info->umem;
+    tx_done = xsk_ring_cons__peek(&umem->cq, CONS_NUM_DESCS, &idx_cq);
+
+    /* Recycle back to umem pool. */
+    for (j = 0; j < tx_done; j++) {
+        uint64_t *addr;
+        void *elem;
+
+        addr = (uint64_t *)xsk_ring_cons__comp_addr(&umem->cq, idx_cq++);
+        if (*addr == UINT64_MAX) {
+            /* The elem has been pushed already. */
+            COVERAGE_INC(afxdp_cq_skip);
+            continue;
+        }
+        elem = ALIGNED_CAST(void *, (char *)umem->buffer + *addr);
+        elems_push[tx_to_free] = elem;
+        *addr = UINT64_MAX; /* Mark as pushed. */
+        tx_to_free++;
+
+        /* Flush to the pool in BATCH_SIZE chunks, plus once more for the
+         * final partial chunk. */
+        if (tx_to_free == BATCH_SIZE || j == tx_done - 1) {
+            umem_elem_push_n(&umem->mpool, tx_to_free, elems_push);
+            xsk_info->outstanding_tx -= tx_to_free;
+            tx_to_free = 0;
+        }
+    }
+
+    if (tx_done > 0) {
+        xsk_ring_cons__release(&umem->cq, tx_done);
+    } else {
+        COVERAGE_INC(afxdp_cq_empty);
+    }
+}
+
+/* Transmits 'batch' on TX queue 'qid' of 'netdev': reclaims completed
+ * descriptors, copies every packet into a freshly popped umem frame, fills
+ * the TX descriptors, and kicks the kernel.  The batch is always consumed
+ * here: either its frames are recycled in bulk (when the whole batch is
+ * AF_XDP-backed from one pool) or the packets are deleted.
+ *
+ * Returns 0 on success, or ENOMEM if umem frames or TX descriptors ran out
+ * (dropped packets are counted in 'tx_dropped'). */
+static inline int
+__netdev_afxdp_batch_send(struct netdev *netdev, int qid,
+                          struct dp_packet_batch *batch)
+{
+    struct netdev_linux *dev = netdev_linux_cast(netdev);
+    struct xsk_socket_info *xsk_info;
+    void *elems_pop[BATCH_SIZE];
+    struct xsk_umem_info *umem;
+    struct dp_packet *packet;
+    bool free_batch = false;
+    unsigned long orig;
+    uint32_t idx = 0;
+    int error = 0;
+    int ret;
+
+    xsk_info = dev->xsks[qid];
+    if (!xsk_info || !xsk_info->xsk) {
+        goto out;
+    }
+
+    /* Reclaim frames from previously completed transmissions first. */
+    afxdp_complete_tx(xsk_info);
+
+    /* Decide now whether the batch can be bulk-freed on the way out. */
+    free_batch = check_free_batch(batch);
+
+    umem = xsk_info->umem;
+    ret = umem_elem_pop_n(&umem->mpool, batch->count, elems_pop);
+    if (OVS_UNLIKELY(ret)) {
+        atomic_add_relaxed(&xsk_info->tx_dropped, batch->count, &orig);
+        VLOG_WARN_RL(&rl, "%s: send failed due to exhausted memory pool.",
+                     netdev_get_name(netdev));
+        error = ENOMEM;
+        goto out;
+    }
+
+    /* Make sure we have enough TX descs. */
+    ret = xsk_ring_prod__reserve(&xsk_info->tx, batch->count, &idx);
+    if (OVS_UNLIKELY(ret == 0)) {
+        /* TX ring is full: return the popped frames, drop the batch, and
+         * try to make progress for the next call. */
+        umem_elem_push_n(&umem->mpool, batch->count, elems_pop);
+        atomic_add_relaxed(&xsk_info->tx_dropped, batch->count, &orig);
+        COVERAGE_INC(afxdp_tx_full);
+        afxdp_complete_tx(xsk_info);
+        kick_tx(xsk_info, dev->xdpmode);
+        error = ENOMEM;
+        goto out;
+    }
+
+    DP_PACKET_BATCH_FOR_EACH (i, packet, batch) {
+        uint64_t index;
+        void *elem;
+
+        elem = elems_pop[i];
+        /* Copy the packet into the umem frame just popped from the pool.
+         * TODO: avoid this copy if the packet and the popped frame are
+         * located in the same umem.
+         */
+        memcpy(elem, dp_packet_data(packet), dp_packet_size(packet));
+
+        /* TX descriptors address frames by their offset into the umem. */
+        index = (uint64_t)((char *)elem - (char *)umem->buffer);
+        xsk_ring_prod__tx_desc(&xsk_info->tx, idx + i)->addr = index;
+        xsk_ring_prod__tx_desc(&xsk_info->tx, idx + i)->len
+            = dp_packet_size(packet);
+    }
+    xsk_ring_prod__submit(&xsk_info->tx, batch->count);
+    xsk_info->outstanding_tx += batch->count;
+
+    ret = kick_tx(xsk_info, dev->xdpmode);
+    if (OVS_UNLIKELY(ret)) {
+        VLOG_WARN_RL(&rl, "%s: error sending AF_XDP packet: %s.",
+                     netdev_get_name(netdev), ovs_strerror(ret));
+    }
+
+out:
+    if (free_batch) {
+        free_afxdp_buf_batch(batch);
+    } else {
+        dp_packet_delete_batch(batch, true);
+    }
+
+    return error;
+}
+
+/* netdev 'send' callback for AF_XDP devices.  When 'concurrent_txq' is
+ * true, several threads may use the same TX queue, so the queue id is
+ * mapped into range and the per-queue spin lock is held around the send. */
+int
+netdev_afxdp_batch_send(struct netdev *netdev, int qid,
+                        struct dp_packet_batch *batch,
+                        bool concurrent_txq)
+{
+    struct netdev_linux *dev;
+    int ret;
+
+    if (concurrent_txq) {
+        dev = netdev_linux_cast(netdev);
+        qid = qid % netdev_n_txq(netdev);
+
+        ovs_spin_lock(&dev->tx_locks[qid]);
+        ret = __netdev_afxdp_batch_send(netdev, qid, batch);
+        ovs_spin_unlock(&dev->tx_locks[qid]);
+    } else {
+        ret = __netdev_afxdp_batch_send(netdev, qid, batch);
+    }
+
+    return ret;
+}
+
+/* RX queue construction is a no-op for AF_XDP: the sockets are created
+ * during reconfiguration instead.  Always returns 0. */
+int
+netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_ OVS_UNUSED)
+{
+    /* Done at reconfigure. */
+    return 0;
+}
+
+/* RX queue destruction is a no-op for AF_XDP: the sockets are torn down in
+ * netdev_afxdp_destruct() instead. */
+void
+netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_ OVS_UNUSED)
+{
+    /* Nothing. */
+}
+
+/* Destroys 'netdev': tears down all of its AF_XDP sockets and destroys the
+ * device mutex.  Also registers, once per process, a fatal-signal hook that
+ * sweeps unused umem pools so resources are released even on a crash. */
+void
+netdev_afxdp_destruct(struct netdev *netdev)
+{
+    static struct ovsthread_once once = OVSTHREAD_ONCE_INITIALIZER;
+    struct netdev_linux *dev = netdev_linux_cast(netdev);
+
+    if (ovsthread_once_start(&once)) {
+        fatal_signal_add_hook(netdev_afxdp_sweep_unused_pools,
+                              NULL, NULL, true);
+        ovsthread_once_done(&once);
+    }
+
+    /* Note: tc is by-passed when using drv-mode, but when using
+     * skb-mode, we might need to clean up tc. */
+
+    xsk_destroy_all(netdev);
+    ovs_mutex_destroy(&dev->mutex);
+}
+
+/* Retrieves statistics for 'netdev' into 'stats'.  RX/TX packet and byte
+ * counts are taken verbatim from the kernel netdev (via netlink); error and
+ * drop counters are added to the values already in 'stats', and each AF_XDP
+ * socket's 'tx_dropped' counter is accumulated on top.
+ *
+ * Returns 0 on success, otherwise the netlink error code, in which case
+ * 'stats' is left unmodified. */
+int
+netdev_afxdp_get_stats(const struct netdev *netdev,
+                       struct netdev_stats *stats)
+{
+    struct netdev_linux *dev = netdev_linux_cast(netdev);
+    struct xsk_socket_info *xsk_info;
+    struct netdev_stats dev_stats;
+    int error, i;
+
+    ovs_mutex_lock(&dev->mutex);
+
+    error = get_stats_via_netlink(netdev, &dev_stats);
+    if (error) {
+        VLOG_WARN_RL(&rl, "%s: Error getting AF_XDP statistics.",
+                     netdev_get_name(netdev));
+    } else {
+        /* Use kernel netdev's packet and byte counts. */
+        stats->rx_packets = dev_stats.rx_packets;
+        stats->rx_bytes = dev_stats.rx_bytes;
+        stats->tx_packets = dev_stats.tx_packets;
+        stats->tx_bytes = dev_stats.tx_bytes;
+
+        stats->rx_errors           += dev_stats.rx_errors;
+        stats->tx_errors           += dev_stats.tx_errors;
+        stats->rx_dropped          += dev_stats.rx_dropped;
+        stats->tx_dropped          += dev_stats.tx_dropped;
+        stats->multicast           += dev_stats.multicast;
+        stats->collisions          += dev_stats.collisions;
+        stats->rx_length_errors    += dev_stats.rx_length_errors;
+        stats->rx_over_errors      += dev_stats.rx_over_errors;
+        stats->rx_crc_errors       += dev_stats.rx_crc_errors;
+        stats->rx_frame_errors     += dev_stats.rx_frame_errors;
+        stats->rx_fifo_errors      += dev_stats.rx_fifo_errors;
+        stats->rx_missed_errors    += dev_stats.rx_missed_errors;
+        stats->tx_aborted_errors   += dev_stats.tx_aborted_errors;
+        stats->tx_carrier_errors   += dev_stats.tx_carrier_errors;
+        stats->tx_fifo_errors      += dev_stats.tx_fifo_errors;
+        stats->tx_heartbeat_errors += dev_stats.tx_heartbeat_errors;
+        stats->tx_window_errors    += dev_stats.tx_window_errors;
+
+        /* Account the dropped in each xsk. */
+        for (i = 0; i < netdev_n_rxq(netdev); i++) {
+            xsk_info = dev->xsks[i];
+            if (xsk_info) {
+                uint64_t tx_dropped;
+
+                atomic_read_relaxed(&xsk_info->tx_dropped, &tx_dropped);
+                stats->tx_dropped += tx_dropped;
+            }
+        }
+    }
+    ovs_mutex_unlock(&dev->mutex);
+
+    return error;
+}
diff --git a/lib/netdev-afxdp.h b/lib/netdev-afxdp.h
new file mode 100644
index 000000000..ea63d76cf
--- /dev/null
+++ b/lib/netdev-afxdp.h
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2018, 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef NETDEV_AFXDP_H
+#define NETDEV_AFXDP_H 1
+
+#ifdef HAVE_AF_XDP
+
+#include <stdint.h>
+#include <stdbool.h>
+
+/* These functions are Linux AF_XDP specific, so they should be used directly
+ * only by Linux-specific code. */
+
+struct netdev;
+struct xsk_socket_info;
+struct xdp_umem;
+struct dp_packet_batch;
+struct smap;
+struct dp_packet;
+struct netdev_rxq;
+struct netdev_stats;
+
+int netdev_afxdp_rxq_construct(struct netdev_rxq *rxq_);
+void netdev_afxdp_rxq_destruct(struct netdev_rxq *rxq_);
+void netdev_afxdp_destruct(struct netdev *netdev_);
+
+int netdev_afxdp_rxq_recv(struct netdev_rxq *rxq_,
+ struct dp_packet_batch *batch,
+ int *qfill);
+int netdev_afxdp_batch_send(struct netdev *netdev_, int qid,
+ struct dp_packet_batch *batch,
+ bool concurrent_txq);
+int netdev_afxdp_set_config(struct netdev *netdev, const struct smap *args,
+ char **errp);
+int netdev_afxdp_get_config(const struct netdev *netdev, struct smap *args);
+int netdev_afxdp_get_numa_id(const struct netdev *netdev);
+int netdev_afxdp_get_stats(const struct netdev *netdev_,
+ struct netdev_stats *stats);
+
+void free_afxdp_buf(struct dp_packet *p);
+int netdev_afxdp_reconfigure(struct netdev *netdev);
+void signal_remove_xdp(struct netdev *netdev);
+
+#else /* !HAVE_AF_XDP */
+
+#include "openvswitch/compiler.h"
+
+struct dp_packet;
+
+static inline void
+free_afxdp_buf(struct dp_packet *p OVS_UNUSED)
+{
+ /* Nothing. */
+}
+
+#endif /* HAVE_AF_XDP */
+#endif /* netdev-afxdp.h */
diff --git a/lib/netdev-linux-private.h b/lib/netdev-linux-private.h
new file mode 100644
index 000000000..ba87f9718
--- /dev/null
+++ b/lib/netdev-linux-private.h
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2019 Nicira, Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at:
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef NETDEV_LINUX_PRIVATE_H
+#define NETDEV_LINUX_PRIVATE_H 1
+
+#include <linux/filter.h>
+#include <linux/gen_stats.h>
+#include <linux/if_ether.h>
+#include <linux/if_tun.h>
+#include <linux/types.h>
+#include <linux/ethtool.h>
+#include <linux/mii.h>
+#include <stdint.h>
+#include <stdbool.h>
+
+#include "netdev-afxdp.h"
+#include "netdev-afxdp-pool.h"
+#include "netdev-provider.h"
+#include "netdev-vport.h"
+#include "openvswitch/thread.h"
+#include "ovs-atomic.h"
+#include "timer.h"
+
+struct netdev;
+
+struct netdev_rxq_linux {
+ struct netdev_rxq up;
+ bool is_tap;
+ int fd;
+};
+
+void netdev_linux_run(const struct netdev_class *);
+
+int get_stats_via_netlink(const struct netdev *netdev_,
+ struct netdev_stats *stats);
+
+struct netdev_linux {
+    struct netdev up;
+
+    /* Protects all members below. */
+    struct ovs_mutex mutex;
+
+    unsigned int cache_valid;       /* Bitmask of VALID_* bits marking which
+                                     * "on demand" fields below are valid. */
+
+    bool miimon;                    /* Link status of last poll. */
+    long long int miimon_interval;  /* Miimon Poll rate. Disabled if <= 0. */
+    struct timer miimon_timer;
+
+    int netnsid;                    /* Network namespace ID. */
+    /* The following are figured out "on demand" only.  They are only valid
+     * when the corresponding VALID_* bit in 'cache_valid' is set. */
+    int ifindex;
+    struct eth_addr etheraddr;
+    int mtu;
+    unsigned int ifi_flags;
+    long long int carrier_resets;
+    uint32_t kbits_rate;            /* Policing data. */
+    uint32_t kbits_burst;
+    int vport_stats_error;          /* Cached error code from vport_get_stats().
+                                       0 or an errno value. */
+    int netdev_mtu_error;           /* Cached error code from SIOCGIFMTU
+                                     * or SIOCSIFMTU.
+                                     */
+    int ether_addr_error;           /* Cached error code from set/get etheraddr. */
+    int netdev_policing_error;      /* Cached error code from set policing. */
+    int get_features_error;         /* Cached error code from ETHTOOL_GSET. */
+    int get_ifindex_error;          /* Cached error code from SIOCGIFINDEX. */
+
+    enum netdev_features current;    /* Cached from ETHTOOL_GSET. */
+    enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
+    enum netdev_features supported;  /* Cached from ETHTOOL_GSET. */
+
+    struct ethtool_drvinfo drvinfo;  /* Cached from ETHTOOL_GDRVINFO. */
+    struct tc *tc;                   /* Traffic control (QoS) state, if any. */
+
+    /* For devices of class netdev_tap_class only. */
+    int tap_fd;
+    bool present;                   /* If the device is present in the namespace */
+    uint64_t tx_dropped;            /* tap device can drop if the iface is down */
+
+    /* LAG information. */
+    bool is_lag_master;             /* True if the netdev is a LAG master. */
+
+#ifdef HAVE_AF_XDP
+    /* AF_XDP information. */
+    struct xsk_socket_info **xsks;  /* AF_XDP sockets, indexed by queue id. */
+    int requested_n_rxq;            /* Requested number of RX queues. */
+    int xdpmode;                    /* AF_XDP running mode: driver or skb. */
+    int requested_xdpmode;          /* Requested mode (see 'reconfigure'). */
+    struct ovs_spin *tx_locks;      /* spin lock array for TX queues. */
+#endif
+};
+
+/* Returns true if 'netdev_class' is one of the netdev-linux-based classes,
+ * which are identified by their shared 'run' callback. */
+static bool
+is_netdev_linux_class(const struct netdev_class *netdev_class)
+{
+    return netdev_class->run == netdev_linux_run;
+}
+
+/* Downcasts 'netdev' to its netdev-linux representation.  Asserts that
+ * 'netdev' really belongs to a netdev-linux-based class. */
+static struct netdev_linux *
+netdev_linux_cast(const struct netdev *netdev)
+{
+    ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
+
+    return CONTAINER_OF(netdev, struct netdev_linux, up);
+}
+
+/* Downcasts 'rx' to its netdev-linux RX queue representation.  Asserts that
+ * the queue's netdev belongs to a netdev-linux-based class. */
+static struct netdev_rxq_linux *
+netdev_rxq_linux_cast(const struct netdev_rxq *rx)
+{
+    ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
+
+    return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
+}
+
+#endif /* netdev-linux-private.h */
diff --git a/lib/netdev-linux.c b/lib/netdev-linux.c
index e4ea94cf9..877049508 100644
--- a/lib/netdev-linux.c
+++ b/lib/netdev-linux.c
@@ -17,6 +17,7 @@
#include <config.h>
#include "netdev-linux.h"
+#include "netdev-linux-private.h"
#include <errno.h>
#include <fcntl.h>
@@ -54,6 +55,7 @@
#include "fatal-signal.h"
#include "hash.h"
#include "openvswitch/hmap.h"
+#include "netdev-afxdp.h"
#include "netdev-provider.h"
#include "netdev-vport.h"
#include "netlink-notifier.h"
@@ -486,57 +488,6 @@ static int tc_calc_cell_log(unsigned int mtu);
static void tc_fill_rate(struct tc_ratespec *rate, uint64_t bps, int mtu);
static int tc_calc_buffer(unsigned int Bps, int mtu, uint64_t burst_bytes);
-struct netdev_linux {
- struct netdev up;
-
- /* Protects all members below. */
- struct ovs_mutex mutex;
-
- unsigned int cache_valid;
-
- bool miimon; /* Link status of last poll. */
- long long int miimon_interval; /* Miimon Poll rate. Disabled if <= 0. */
- struct timer miimon_timer;
-
- int netnsid; /* Network namespace ID. */
- /* The following are figured out "on demand" only. They are only valid
- * when the corresponding VALID_* bit in 'cache_valid' is set. */
- int ifindex;
- struct eth_addr etheraddr;
- int mtu;
- unsigned int ifi_flags;
- long long int carrier_resets;
- uint32_t kbits_rate; /* Policing data. */
- uint32_t kbits_burst;
- int vport_stats_error; /* Cached error code from vport_get_stats().
- 0 or an errno value. */
- int netdev_mtu_error; /* Cached error code from SIOCGIFMTU or SIOCSIFMTU. */
- int ether_addr_error; /* Cached error code from set/get etheraddr. */
- int netdev_policing_error; /* Cached error code from set policing. */
- int get_features_error; /* Cached error code from ETHTOOL_GSET. */
- int get_ifindex_error; /* Cached error code from SIOCGIFINDEX. */
-
- enum netdev_features current; /* Cached from ETHTOOL_GSET. */
- enum netdev_features advertised; /* Cached from ETHTOOL_GSET. */
- enum netdev_features supported; /* Cached from ETHTOOL_GSET. */
-
- struct ethtool_drvinfo drvinfo; /* Cached from ETHTOOL_GDRVINFO. */
- struct tc *tc;
-
- /* For devices of class netdev_tap_class only. */
- int tap_fd;
- bool present; /* If the device is present in the namespace */
- uint64_t tx_dropped; /* tap device can drop if the iface is down */
-
- /* LAG information. */
- bool is_lag_master; /* True if the netdev is a LAG master. */
-};
-
-struct netdev_rxq_linux {
- struct netdev_rxq up;
- bool is_tap;
- int fd;
-};
/* This is set pretty low because we probably won't learn anything from the
* additional log messages. */
@@ -550,8 +501,6 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
* changes in the device miimon status, so we can use atomic_count. */
static atomic_count miimon_cnt = ATOMIC_COUNT_INIT(0);
-static void netdev_linux_run(const struct netdev_class *);
-
static int netdev_linux_do_ethtool(const char *name, struct ethtool_cmd *,
int cmd, const char *cmd_name);
static int get_flags(const struct netdev *, unsigned int *flags);
@@ -565,7 +514,6 @@ static int do_set_addr(struct netdev *netdev,
struct in_addr addr);
static int get_etheraddr(const char *netdev_name, struct eth_addr *ea);
static int set_etheraddr(const char *netdev_name, const struct eth_addr);
-static int get_stats_via_netlink(const struct netdev *, struct netdev_stats *);
static int af_packet_sock(void);
static bool netdev_linux_miimon_enabled(void);
static void netdev_linux_miimon_run(void);
@@ -573,31 +521,10 @@ static void netdev_linux_miimon_wait(void);
static int netdev_linux_get_mtu__(struct netdev_linux *netdev, int *mtup);
static bool
-is_netdev_linux_class(const struct netdev_class *netdev_class)
-{
- return netdev_class->run == netdev_linux_run;
-}
-
-static bool
is_tap_netdev(const struct netdev *netdev)
{
return netdev_get_class(netdev) == &netdev_tap_class;
}
-
-static struct netdev_linux *
-netdev_linux_cast(const struct netdev *netdev)
-{
- ovs_assert(is_netdev_linux_class(netdev_get_class(netdev)));
-
- return CONTAINER_OF(netdev, struct netdev_linux, up);
-}
-
-static struct netdev_rxq_linux *
-netdev_rxq_linux_cast(const struct netdev_rxq *rx)
-{
- ovs_assert(is_netdev_linux_class(netdev_get_class(rx->netdev)));
- return CONTAINER_OF(rx, struct netdev_rxq_linux, up);
-}
static int
netdev_linux_netnsid_update__(struct netdev_linux *netdev)
@@ -773,7 +700,7 @@ netdev_linux_update_lag(struct rtnetlink_change *change)
}
}
-static void
+void
netdev_linux_run(const struct netdev_class *netdev_class OVS_UNUSED)
{
struct nl_sock *sock;
@@ -3278,9 +3205,7 @@ exit:
.run = netdev_linux_run, \
.wait = netdev_linux_wait, \
.alloc = netdev_linux_alloc, \
- .destruct = netdev_linux_destruct, \
.dealloc = netdev_linux_dealloc, \
- .send = netdev_linux_send, \
.send_wait = netdev_linux_send_wait, \
.set_etheraddr = netdev_linux_set_etheraddr, \
.get_etheraddr = netdev_linux_get_etheraddr, \
@@ -3311,39 +3236,74 @@ exit:
.arp_lookup = netdev_linux_arp_lookup, \
.update_flags = netdev_linux_update_flags, \
.rxq_alloc = netdev_linux_rxq_alloc, \
- .rxq_construct = netdev_linux_rxq_construct, \
- .rxq_destruct = netdev_linux_rxq_destruct, \
.rxq_dealloc = netdev_linux_rxq_dealloc, \
- .rxq_recv = netdev_linux_rxq_recv, \
.rxq_wait = netdev_linux_rxq_wait, \
.rxq_drain = netdev_linux_rxq_drain
const struct netdev_class netdev_linux_class = {
NETDEV_LINUX_CLASS_COMMON,
.type = "system",
+ .is_pmd = false,
.construct = netdev_linux_construct,
+ .destruct = netdev_linux_destruct,
.get_stats = netdev_linux_get_stats,
.get_features = netdev_linux_get_features,
.get_status = netdev_linux_get_status,
- .get_block_id = netdev_linux_get_block_id
+ .get_block_id = netdev_linux_get_block_id,
+ .send = netdev_linux_send,
+ .rxq_construct = netdev_linux_rxq_construct,
+ .rxq_destruct = netdev_linux_rxq_destruct,
+ .rxq_recv = netdev_linux_rxq_recv,
};
const struct netdev_class netdev_tap_class = {
NETDEV_LINUX_CLASS_COMMON,
.type = "tap",
+ .is_pmd = false,
.construct = netdev_linux_construct_tap,
+ .destruct = netdev_linux_destruct,
.get_stats = netdev_tap_get_stats,
.get_features = netdev_linux_get_features,
.get_status = netdev_linux_get_status,
+ .send = netdev_linux_send,
+ .rxq_construct = netdev_linux_rxq_construct,
+ .rxq_destruct = netdev_linux_rxq_destruct,
+ .rxq_recv = netdev_linux_rxq_recv,
};
const struct netdev_class netdev_internal_class = {
NETDEV_LINUX_CLASS_COMMON,
.type = "internal",
+ .is_pmd = false,
.construct = netdev_linux_construct,
+ .destruct = netdev_linux_destruct,
.get_stats = netdev_internal_get_stats,
.get_status = netdev_internal_get_status,
+ .send = netdev_linux_send,
+ .rxq_construct = netdev_linux_rxq_construct,
+ .rxq_destruct = netdev_linux_rxq_destruct,
+ .rxq_recv = netdev_linux_rxq_recv,
};
+
+#ifdef HAVE_AF_XDP
+/* AF_XDP netdev class: polled by PMD threads (is_pmd = true).  Reuses the
+ * common netdev-linux scaffolding but overrides the datapath (send/recv),
+ * configuration, and stats operations with AF_XDP-specific ones. */
+const struct netdev_class netdev_afxdp_class = {
+    NETDEV_LINUX_CLASS_COMMON,
+    .type = "afxdp",
+    .is_pmd = true,
+    .construct = netdev_linux_construct,
+    .destruct = netdev_afxdp_destruct,
+    .get_stats = netdev_afxdp_get_stats,
+    .get_status = netdev_linux_get_status,
+    .set_config = netdev_afxdp_set_config,
+    .get_config = netdev_afxdp_get_config,
+    .reconfigure = netdev_afxdp_reconfigure,
+    .get_numa_id = netdev_afxdp_get_numa_id,
+    .send = netdev_afxdp_batch_send,
+    .rxq_construct = netdev_afxdp_rxq_construct,
+    .rxq_destruct = netdev_afxdp_rxq_destruct,
+    .rxq_recv = netdev_afxdp_rxq_recv,
+};
+#endif
#define CODEL_N_QUEUES 0x0000
@@ -5915,7 +5875,7 @@ netdev_stats_from_rtnl_link_stats64(struct netdev_stats *dst,
dst->tx_window_errors = src->tx_window_errors;
}
-static int
+int
get_stats_via_netlink(const struct netdev *netdev_, struct netdev_stats *stats)
{
struct ofpbuf request;
diff --git a/lib/netdev-provider.h b/lib/netdev-provider.h
index 2a545c986..1e5a40c89 100644
--- a/lib/netdev-provider.h
+++ b/lib/netdev-provider.h
@@ -832,6 +832,9 @@ extern const struct netdev_class netdev_linux_class;
extern const struct netdev_class netdev_internal_class;
extern const struct netdev_class netdev_tap_class;
+#ifdef HAVE_AF_XDP
+extern const struct netdev_class netdev_afxdp_class;
+#endif
#ifdef __cplusplus
}
#endif
diff --git a/lib/netdev.c b/lib/netdev.c
index 6b34dec9c..b1976d365 100644
--- a/lib/netdev.c
+++ b/lib/netdev.c
@@ -103,6 +103,9 @@ static struct vlog_rate_limit rl = VLOG_RATE_LIMIT_INIT(5, 20);
static void restore_all_flags(void *aux OVS_UNUSED);
void update_device_args(struct netdev *, const struct shash *args);
+#ifdef HAVE_AF_XDP
+void signal_remove_xdp(struct netdev *netdev);
+#endif
int
netdev_n_txq(const struct netdev *netdev)
@@ -147,6 +150,9 @@ netdev_initialize(void)
netdev_vport_tunnel_register();
netdev_register_flow_api_provider(&netdev_offload_tc);
+#ifdef HAVE_AF_XDP
+ netdev_register_provider(&netdev_afxdp_class);
+#endif
#endif
#if defined(__FreeBSD__) || defined(__NetBSD__)
netdev_register_provider(&netdev_tap_class);
@@ -2021,6 +2027,11 @@ restore_all_flags(void *aux OVS_UNUSED)
saved_flags & ~saved_values,
&old_flags);
}
+#ifdef HAVE_AF_XDP
+ if (netdev->netdev_class == &netdev_afxdp_class) {
+ signal_remove_xdp(netdev);
+ }
+#endif
}
}
diff --git a/lib/util.c b/lib/util.c
index 7b8ab81f6..830e14516 100644
--- a/lib/util.c
+++ b/lib/util.c
@@ -214,20 +214,19 @@ x2nrealloc(void *p, size_t *n, size_t s)
return xrealloc(p, *n * s);
}
-/* Allocates and returns 'size' bytes of memory aligned to a cache line and in
- * dedicated cache lines. That is, the memory block returned will not share a
- * cache line with other data, avoiding "false sharing".
+/* Allocates and returns 'size' bytes of memory aligned to 'alignment' bytes.
+ * 'alignment' must be a power of two and a multiple of sizeof(void *).
*
- * Use free_cacheline() to free the returned memory block. */
+ * Use free_size_align() to free the returned memory block. */
void *
-xmalloc_cacheline(size_t size)
+xmalloc_size_align(size_t size, size_t alignment)
{
#ifdef HAVE_POSIX_MEMALIGN
void *p;
int error;
COVERAGE_INC(util_xalloc);
- error = posix_memalign(&p, CACHE_LINE_SIZE, size ? size : 1);
+ error = posix_memalign(&p, alignment, size ? size : 1);
if (error != 0) {
out_of_memory();
}
@@ -235,16 +234,16 @@ xmalloc_cacheline(size_t size)
#else
/* Allocate room for:
*
- * - Header padding: Up to CACHE_LINE_SIZE - 1 bytes, to allow the
- * pointer to be aligned exactly sizeof(void *) bytes before the
- * beginning of a cache line.
+     * - Header padding: Up to alignment - 1 bytes, to allow the
+     *   pointer 'q' to be placed exactly sizeof(void *) bytes before the
+     *   next alignment boundary.
*
* - Pointer: A pointer to the start of the header padding, to allow us
* to free() the block later.
*
* - User data: 'size' bytes.
*
- * - Trailer padding: Enough to bring the user data up to a cache line
+ * - Trailer padding: Enough to bring the user data up to a alignment
* multiple.
*
* +---------------+---------+------------------------+---------+
@@ -255,18 +254,55 @@ xmalloc_cacheline(size_t size)
* p q r
*
*/
- void *p = xmalloc((CACHE_LINE_SIZE - 1)
- + sizeof(void *)
- + ROUND_UP(size, CACHE_LINE_SIZE));
- bool runt = PAD_SIZE((uintptr_t) p, CACHE_LINE_SIZE) < sizeof(void *);
- void *r = (void *) ROUND_UP((uintptr_t) p + (runt ? CACHE_LINE_SIZE : 0),
- CACHE_LINE_SIZE);
- void **q = (void **) r - 1;
+ void *p, *r, **q;
+ bool runt;
+
+ if (!IS_POW2(alignment) || (alignment % sizeof(void *) != 0)) {
+ ovs_abort(0, "Invalid alignment");
+ }
+
+ p = xmalloc((alignment - 1)
+ + sizeof(void *)
+ + ROUND_UP(size, alignment));
+
+ runt = PAD_SIZE((uintptr_t) p, alignment) < sizeof(void *);
+    /* When the padding size is < sizeof(void *), there is not enough room
+     * for pointer 'q'.  As a result, 'r' must be moved to the next alignment
+     * boundary.  Hence the ROUND_UP in the xmalloc() call above, and the
+     * ROUND_UP again when calculating 'r' below.
+     */
+ r = (void *) ROUND_UP((uintptr_t) p + (runt ? alignment : 0), alignment);
+ q = (void **) r - 1;
*q = p;
+
return r;
#endif
}
+/* Frees a memory block allocated with xmalloc_size_align(). */
+void
+free_size_align(void *p)
+{
+#ifdef HAVE_POSIX_MEMALIGN
+    free(p);
+#else
+    if (p) {
+        /* The pointer to the original xmalloc()'ed block is stored just
+         * before the aligned user data. */
+        void **q = (void **) p - 1;
+        free(*q);
+    }
+#endif
+}
+
+/* Allocates and returns 'size' bytes of memory aligned to a cache line and in
+ * dedicated cache lines. That is, the memory block returned will not share a
+ * cache line with other data, avoiding "false sharing".
+ *
+ * Use free_cacheline() to free the returned memory block. */
+void *
+xmalloc_cacheline(size_t size)
+{
+ return xmalloc_size_align(size, CACHE_LINE_SIZE);
+}
+
/* Like xmalloc_cacheline() but clears the allocated memory to all zero
* bytes. */
void *
@@ -282,14 +318,19 @@ xzalloc_cacheline(size_t size)
void
free_cacheline(void *p)
{
-#ifdef HAVE_POSIX_MEMALIGN
- free(p);
-#else
- if (p) {
- void **q = (void **) p - 1;
- free(*q);
- }
-#endif
+ free_size_align(p);
+}
+
+/* Allocates and returns 'size' bytes of memory aligned to the system page
+ * size.  Use free_pagealign() to free the returned memory block. */
+void *
+xmalloc_pagealign(size_t size)
+{
+    return xmalloc_size_align(size, get_page_size());
+}
+
+/* Frees a memory block allocated with xmalloc_pagealign(). */
+void
+free_pagealign(void *p)
+{
+    free_size_align(p);
}
char *
diff --git a/lib/util.h b/lib/util.h
index 095ede20f..7ad8758fe 100644
--- a/lib/util.h
+++ b/lib/util.h
@@ -169,6 +169,11 @@ void ovs_strzcpy(char *dst, const char *src, size_t size);
int string_ends_with(const char *str, const char *suffix);
+void *xmalloc_pagealign(size_t) MALLOC_LIKE;
+void free_pagealign(void *);
+void *xmalloc_size_align(size_t, size_t) MALLOC_LIKE;
+void free_size_align(void *);
+
/* The C standards say that neither the 'dst' nor 'src' argument to
* memcpy() may be null, even if 'n' is zero. This wrapper tolerates
* the null case. */