From 5ab073ffd326480a6185d096e9703f62ef92b86c Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:26 +0200 Subject: xdp: introduce xdp_return_frame API and use in cpumap Introduce an xdp_return_frame API, and convert over cpumap as the first user, given it have queued XDP frame structure to leverage. V3: Cleanup and remove C99 style comments, pointed out by Alex Duyck. V6: Remove comment that id will be added later (Req by Alex Duyck) V8: Rename enum mem_type to xdp_mem_type (found by kbuild test robot) Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 60 ++++++++++++++++++++++++++++++++--------------------- 1 file changed, 36 insertions(+), 24 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index a4bb0b34375a..3e4bbcbe3e86 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -137,27 +138,6 @@ free_cmap: return ERR_PTR(err); } -static void __cpu_map_queue_destructor(void *ptr) -{ - /* The tear-down procedure should have made sure that queue is - * empty. See __cpu_map_entry_replace() and work-queue - * invoked cpu_map_kthread_stop(). Catch any broken behaviour - * gracefully and warn once. - */ - if (WARN_ON_ONCE(ptr)) - page_frag_free(ptr); -} - -static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) -{ - if (atomic_dec_and_test(&rcpu->refcnt)) { - /* The queue should be empty at this point */ - ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor); - kfree(rcpu->queue); - kfree(rcpu); - } -} - static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) { atomic_inc(&rcpu->refcnt); @@ -188,6 +168,10 @@ struct xdp_pkt { u16 len; u16 headroom; u16 metasize; + /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, + * while mem info is valid on remote CPU. + */ + struct xdp_mem_info mem; struct net_device *dev_rx; }; @@ -213,6 +197,9 @@ static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); xdp_pkt->metasize = metasize; + /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ + xdp_pkt->mem = xdp->rxq->mem; + return xdp_pkt; } @@ -265,6 +252,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, return skb; } +static void __cpu_map_ring_cleanup(struct ptr_ring *ring) +{ + /* The tear-down procedure should have made sure that queue is + * empty. See __cpu_map_entry_replace() and work-queue + * invoked cpu_map_kthread_stop(). Catch any broken behaviour + * gracefully and warn once. + */ + struct xdp_pkt *xdp_pkt; + + while ((xdp_pkt = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdp_pkt)) + xdp_return_frame(xdp_pkt, &xdp_pkt->mem); +} + +static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) +{ + if (atomic_dec_and_test(&rcpu->refcnt)) { + /* The queue should be empty at this point */ + __cpu_map_ring_cleanup(rcpu->queue); + ptr_ring_cleanup(rcpu->queue, NULL); + kfree(rcpu->queue); + kfree(rcpu); + } +} + static int cpu_map_kthread_run(void *data) { struct bpf_cpu_map_entry *rcpu = data; @@ -307,7 +319,7 @@ static int cpu_map_kthread_run(void *data) skb = cpu_map_build_skb(rcpu, xdp_pkt); if (!skb) { - page_frag_free(xdp_pkt); + xdp_return_frame(xdp_pkt, &xdp_pkt->mem); continue; } @@ -604,13 +616,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - void *xdp_pkt = bq->q[i]; + struct xdp_pkt *xdp_pkt = bq->q[i]; int err; err = __ptr_ring_produce(q, xdp_pkt); if (err) { drops++; - page_frag_free(xdp_pkt); /* Free xdp_pkt */ + xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem); } processed++; } -- cgit v1.2.1 From 70280ed91cb8acb43e8fd7a8094840846c172ac5 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:45:57 +0200 Subject: bpf: cpumap convert to use generic xdp_frame The generic xdp_frame format, was inspired by the cpumap own internal xdp_pkt format. It is now time to convert it over to the generic xdp_frame format. The cpumap needs one extra field dev_rx. Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 100 +++++++++++++++------------------------------------- 1 file changed, 28 insertions(+), 72 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index 3e4bbcbe3e86..bcdc4dea5ce7 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -159,52 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work) kthread_stop(rcpu->kthread); } -/* For now, xdp_pkt is a cpumap internal data structure, with info - * carried between enqueue to dequeue. It is mapped into the top - * headroom of the packet, to avoid allocating separate mem. - */ -struct xdp_pkt { - void *data; - u16 len; - u16 headroom; - u16 metasize; - /* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time, - * while mem info is valid on remote CPU. - */ - struct xdp_mem_info mem; - struct net_device *dev_rx; -}; - -/* Convert xdp_buff to xdp_pkt */ -static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp) -{ - struct xdp_pkt *xdp_pkt; - int metasize; - int headroom; - - /* Assure headroom is available for storing info */ - headroom = xdp->data - xdp->data_hard_start; - metasize = xdp->data - xdp->data_meta; - metasize = metasize > 0 ? metasize : 0; - if (unlikely((headroom - metasize) < sizeof(*xdp_pkt))) - return NULL; - - /* Store info in top of packet */ - xdp_pkt = xdp->data_hard_start; - - xdp_pkt->data = xdp->data; - xdp_pkt->len = xdp->data_end - xdp->data; - xdp_pkt->headroom = headroom - sizeof(*xdp_pkt); - xdp_pkt->metasize = metasize; - - /* rxq only valid until napi_schedule ends, convert to xdp_mem_info */ - xdp_pkt->mem = xdp->rxq->mem; - - return xdp_pkt; -} - static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, - struct xdp_pkt *xdp_pkt) + struct xdp_frame *xdpf) { unsigned int frame_size; void *pkt_data_start; @@ -219,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * would be preferred to set frame_size to 2048 or 4096 * depending on the driver. * frame_size = 2048; - * frame_len = frame_size - sizeof(*xdp_pkt); + * frame_len = frame_size - sizeof(*xdp_frame); * * Instead, with info avail, skb_shared_info in placed after * packet len. This, unfortunately fakes the truesize. @@ -227,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu, * is not at a fixed memory location, with mixed length * packets, which is bad for cache-line hotness. */ - frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom + + frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - pkt_data_start = xdp_pkt->data - xdp_pkt->headroom; + pkt_data_start = xdpf->data - xdpf->headroom; skb = build_skb(pkt_data_start, frame_size); if (!skb) return NULL; - skb_reserve(skb, xdp_pkt->headroom); - __skb_put(skb, xdp_pkt->len); - if (xdp_pkt->metasize) - skb_metadata_set(skb, xdp_pkt->metasize); + skb_reserve(skb, xdpf->headroom); + __skb_put(skb, xdpf->len); + if (xdpf->metasize) + skb_metadata_set(skb, xdpf->metasize); /* Essential SKB info: protocol and skb->dev */ - skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx); + skb->protocol = eth_type_trans(skb, xdpf->dev_rx); /* Optional SKB info, currently missing: * - HW checksum info (skb->ip_summed) @@ -259,11 +215,11 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) * invoked cpu_map_kthread_stop(). Catch any broken behaviour * gracefully and warn once. */ - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - while ((xdp_pkt = ptr_ring_consume(ring))) - if (WARN_ON_ONCE(xdp_pkt)) - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + while ((xdpf = ptr_ring_consume(ring))) + if (WARN_ON_ONCE(xdpf)) + xdp_return_frame(xdpf->data, &xdpf->mem); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -290,7 +246,7 @@ static int cpu_map_kthread_run(void *data) */ while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) { unsigned int processed = 0, drops = 0, sched = 0; - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; /* Release CPU reschedule checks */ if (__ptr_ring_empty(rcpu->queue)) { @@ -313,13 +269,13 @@ static int cpu_map_kthread_run(void *data) * kthread CPU pinned. Lockless access to ptr_ring * consume side valid as no-resize allowed of queue. */ - while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) { + while ((xdpf = __ptr_ring_consume(rcpu->queue))) { struct sk_buff *skb; int ret; - skb = cpu_map_build_skb(rcpu, xdp_pkt); + skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - xdp_return_frame(xdp_pkt, &xdp_pkt->mem); + xdp_return_frame(xdpf->data, &xdpf->mem); continue; } @@ -616,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, spin_lock(&q->producer_lock); for (i = 0; i < bq->count; i++) { - struct xdp_pkt *xdp_pkt = bq->q[i]; + struct xdp_frame *xdpf = bq->q[i]; int err; - err = __ptr_ring_produce(q, xdp_pkt); + err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdp_pkt->data, &xdp_pkt->mem); + xdp_return_frame(xdpf->data, &xdpf->mem); } processed++; } @@ -637,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, /* Runs under RCU-read-side, plus in softirq under NAPI protection. * Thus, safe percpu variable access. */ -static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) +static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf) { struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq); @@ -648,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt) * driver to code invoking us to finished, due to driver * (e.g. ixgbe) recycle tricks based on page-refcnt. * - * Thus, incoming xdp_pkt is always queued here (else we race + * Thus, incoming xdp_frame is always queued here (else we race * with another CPU on page-refcnt and remaining driver code). * Queue time is very short, as driver will invoke flush * operation, when completing napi->poll call. */ - bq->q[bq->count++] = xdp_pkt; + bq->q[bq->count++] = xdpf; return 0; } int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp, struct net_device *dev_rx) { - struct xdp_pkt *xdp_pkt; + struct xdp_frame *xdpf; - xdp_pkt = convert_to_xdp_pkt(xdp); - if (unlikely(!xdp_pkt)) + xdpf = convert_to_xdp_frame(xdp); + if (unlikely(!xdpf)) return -EOVERFLOW; /* Info needed when constructing SKB on remote CPU */ - xdp_pkt->dev_rx = dev_rx; + xdpf->dev_rx = dev_rx; - bq_enqueue(rcpu, xdp_pkt); + bq_enqueue(rcpu, xdpf); return 0; } -- cgit v1.2.1 From 039930945a72d9af5ff04ae9b9e60658a52e0770 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Tue, 17 Apr 2018 16:46:32 +0200 Subject: xdp: transition into using xdp_frame for return API Changing API xdp_return_frame() to take struct xdp_frame as argument, seems like a natural choice. But there are some subtle performance details here that needs extra care, which is a deliberate choice. When de-referencing xdp_frame on a remote CPU during DMA-TX completion, result in the cache-line is change to "Shared" state. Later when the page is reused for RX, then this xdp_frame cache-line is written, which change the state to "Modified". This situation already happens (naturally) for, virtio_net, tun and cpumap as the xdp_frame pointer is the queued object. In tun and cpumap, the ptr_ring is used for efficiently transferring cache-lines (with pointers) between CPUs. Thus, the only option is to de-referencing xdp_frame. It is only the ixgbe driver that had an optimization, in which it can avoid doing the de-reference of xdp_frame. The driver already have TX-ring queue, which (in case of remote DMA-TX completion) have to be transferred between CPUs anyhow. In this data area, we stored a struct xdp_mem_info and a data pointer, which allowed us to avoid de-referencing xdp_frame. To compensate for this, a prefetchw is used for telling the cache coherency protocol about our access pattern. My benchmarks show that this prefetchw is enough to compensate the ixgbe driver. V7: Adjust for commit d9314c474d4f ("i40e: add support for XDP_REDIRECT") V8: Adjust for commit bd658dda4237 ("net/mlx5e: Separate dma base address and offset in dma_sync call") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller --- kernel/bpf/cpumap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel/bpf') diff --git a/kernel/bpf/cpumap.c b/kernel/bpf/cpumap.c index bcdc4dea5ce7..c95b04ec103e 100644 --- a/kernel/bpf/cpumap.c +++ b/kernel/bpf/cpumap.c @@ -219,7 +219,7 @@ static void __cpu_map_ring_cleanup(struct ptr_ring *ring) while ((xdpf = ptr_ring_consume(ring))) if (WARN_ON_ONCE(xdpf)) - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu) @@ -275,7 +275,7 @@ static int cpu_map_kthread_run(void *data) skb = cpu_map_build_skb(rcpu, xdpf); if (!skb) { - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); continue; } @@ -578,7 +578,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu, err = __ptr_ring_produce(q, xdpf); if (err) { drops++; - xdp_return_frame(xdpf->data, &xdpf->mem); + xdp_return_frame(xdpf); } processed++; } -- cgit v1.2.1