From bc235cdb423a2daed6f337676006a66557429cd1 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 25 Feb 2021 15:43:15 -0800 Subject: bpf: Prevent deadlock from recursive bpf_task_storage_[get|delete] BPF helpers bpf_task_storage_[get|delete] could hold two locks: bpf_local_storage_map_bucket->lock and bpf_local_storage->lock. Calling these helpers from fentry/fexit programs on functions in bpf_*_storage.c may cause deadlock on either lock. Prevent such deadlock with a per-cpu counter, bpf_task_storage_busy. We need this counter to be global, because the two locks here belong to two different objects: bpf_local_storage_map and bpf_local_storage. If we pick one of them as the owner of the counter, it is still possible to trigger deadlock on the other lock. For example, if bpf_local_storage_map owns the counter, it cannot prevent deadlock on bpf_local_storage->lock when two maps are used. Signed-off-by: Song Liu Signed-off-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210225234319.336131-3-songliubraving@fb.com --- net/core/bpf_sk_storage.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c index 4edd033e899c..cc3712ad8716 100644 --- a/net/core/bpf_sk_storage.c +++ b/net/core/bpf_sk_storage.c @@ -89,7 +89,7 @@ static void bpf_sk_storage_map_free(struct bpf_map *map) smap = (struct bpf_local_storage_map *)map; bpf_local_storage_cache_idx_free(&sk_cache, smap->cache_idx); - bpf_local_storage_map_free(smap); + bpf_local_storage_map_free(smap, NULL); } static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr) -- cgit v1.2.1 From 887596095ec2a9ea39ffcf98f27bf2e77c5eb512 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:26 -0800 Subject: bpf: Clean up sockmap related Kconfigs As suggested by John, clean up the sockmap-related Kconfigs: Reduce the scope of CONFIG_BPF_STREAM_PARSER down to the TCP stream parser, to reflect its name. Make the rest of the sockmap code simply depend on CONFIG_BPF_SYSCALL and CONFIG_INET; the latter is still needed at this point because of the TCP/UDP proto updates. And leave CONFIG_NET_SOCK_MSG untouched, as it is used by non-sockmap cases.
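An aside on the recursion guard introduced by the first patch above: the per-cpu bpf_task_storage_busy counter follows a standard kernel trylock-style shape. A simplified sketch, not the exact kernel code:

	static DEFINE_PER_CPU(int, bpf_task_storage_busy);

	static bool bpf_task_storage_trylock(void)
	{
		migrate_disable();
		if (unlikely(this_cpu_inc_return(bpf_task_storage_busy) != 1)) {
			/* Re-entered on this CPU (e.g. from a fentry/fexit
			 * program): back out and report busy instead of
			 * taking the storage locks again and deadlocking.
			 */
			this_cpu_dec(bpf_task_storage_busy);
			migrate_enable();
			return false;
		}
		return true;
	}

	static void bpf_task_storage_unlock(void)
	{
		this_cpu_dec(bpf_task_storage_busy);
		migrate_enable();
	}

With such a guard, bpf_task_storage_[get|delete] can fail with -EBUSY on recursion rather than deadlock on either lock.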
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-2-xiyou.wangcong@gmail.com --- net/core/Makefile | 6 ++- net/core/skmsg.c | 145 ++++++++++++++++++++++++++++------------------------ net/core/sock_map.c | 2 + 3 files changed, 85 insertions(+), 68 deletions(-) (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 3e2c378e5f31..0c2233c826fd 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -16,7 +16,6 @@ obj-y += dev.o dev_addr_lists.o dst.o netevent.o \ obj-y += net-sysfs.o obj-$(CONFIG_PAGE_POOL) += page_pool.o obj-$(CONFIG_PROC_FS) += net-procfs.o -obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_NET_PKTGEN) += pktgen.o obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o @@ -28,10 +27,13 @@ obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o obj-$(CONFIG_LWTUNNEL) += lwtunnel.o obj-$(CONFIG_LWTUNNEL_BPF) += lwt_bpf.o -obj-$(CONFIG_BPF_STREAM_PARSER) += sock_map.o obj-$(CONFIG_DST_CACHE) += dst_cache.o obj-$(CONFIG_HWBM) += hwbm.o obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o +ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o +obj-$(CONFIG_BPF_SYSCALL) += sock_map.o +endif obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 1261512d6807..e017744111e1 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -645,15 +645,15 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +static void sk_psock_done_strp(struct sk_psock *psock); + static void sk_psock_destroy_deferred(struct work_struct *gc) { struct sk_psock *psock = container_of(gc, struct sk_psock, gc); /* No sk_callback_lock since already detached. 
*/ - /* Parser has been stopped */ - if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + sk_psock_done_strp(psock); cancel_work_sync(&psock->work); @@ -750,14 +750,6 @@ static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, return bpf_prog_run_pin_on_cpu(prog, skb); } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -866,6 +858,24 @@ out_free: } } +static void sk_psock_write_space(struct sock *sk) +{ + struct sk_psock *psock; + void (*write_space)(struct sock *sk) = NULL; + + rcu_read_lock(); + psock = sk_psock(sk); + if (likely(psock)) { + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) + schedule_work(&psock->work); + write_space = psock->saved_write_space; + } + rcu_read_unlock(); + if (write_space) + write_space(sk); +} + +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock; @@ -897,6 +907,14 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } +static struct sk_psock *sk_psock_from_strp(struct strparser *strp) +{ + struct sk_psock_parser *parser; + + parser = container_of(strp, struct sk_psock_parser, strp); + return container_of(parser, struct sk_psock, parser); +} + static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { struct sk_psock *psock = sk_psock_from_strp(strp); @@ -933,6 +951,56 @@ static void sk_psock_strp_data_ready(struct sock *sk) rcu_read_unlock(); } +int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) +{ + static const struct strp_callbacks cb = { + .rcv_msg = sk_psock_strp_read, + .read_sock_done = sk_psock_strp_read_done, + .parse_msg = sk_psock_strp_parse, + }; + + psock->parser.enabled = false; + return strp_init(&psock->parser.strp, sk, &cb); +} + +void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (parser->enabled) + return; + + parser->saved_data_ready = sk->sk_data_ready; + sk->sk_data_ready = sk_psock_strp_data_ready; + sk->sk_write_space = sk_psock_write_space; + parser->enabled = true; +} + +void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) +{ + struct sk_psock_parser *parser = &psock->parser; + + if (!parser->enabled) + return; + + sk->sk_data_ready = parser->saved_data_ready; + parser->saved_data_ready = NULL; + strp_stop(&parser->strp); + parser->enabled = false; +} + +static void sk_psock_done_strp(struct sk_psock *psock) +{ + /* Parser has been stopped */ + if (psock->progs.skb_parser) + strp_done(&psock->parser.strp); +} +#else +static void sk_psock_done_strp(struct sk_psock *psock) +{ +} +#endif /* CONFIG_BPF_STREAM_PARSER */ + static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, unsigned int offset, size_t orig_len) { @@ -984,35 +1052,6 @@ static void sk_psock_verdict_data_ready(struct sock *sk) sock->ops->read_sock(sk, &desc, sk_psock_verdict_recv); } -static void sk_psock_write_space(struct sock *sk) -{ - struct sk_psock *psock; - void (*write_space)(struct sock *sk) = NULL; - - rcu_read_lock(); - psock = sk_psock(sk); - if (likely(psock)) { - if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) - schedule_work(&psock->work); - write_space = psock->saved_write_space; - } - 
rcu_read_unlock(); - if (write_space) - write_space(sk); -} - -int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) -{ - static const struct strp_callbacks cb = { - .rcv_msg = sk_psock_strp_read, - .read_sock_done = sk_psock_strp_read_done, - .parse_msg = sk_psock_strp_parse, - }; - - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); -} - void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; @@ -1026,32 +1065,6 @@ void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) parser->enabled = true; } -void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) - return; - - parser->saved_data_ready = sk->sk_data_ready; - sk->sk_data_ready = sk_psock_strp_data_ready; - sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; -} - -void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) -{ - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) - return; - - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; -} - void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { struct sk_psock_parser *parser = &psock->parser; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d758fb83c884..ee3334dd3a38 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1461,9 +1461,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, case BPF_SK_MSG_VERDICT: pprog = &progs->msg_parser; break; +#if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: pprog = &progs->skb_parser; break; +#endif case BPF_SK_SKB_STREAM_VERDICT: pprog = &progs->skb_verdict; break; -- cgit v1.2.1 From 5a685cd94b21a88efa6be77169eddef525368034 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:27 -0800 Subject: skmsg: Get rid of struct sk_psock_parser struct sk_psock_parser is embedded in sk_psock; it is unnecessary, as the skb verdict path also uses ->saved_data_ready. We can simply fold these fields into sk_psock, and get rid of ->enabled.
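To visualize the folding described above, a sketch with fields abridged:

	/* Before: strparser state and the saved callback live in an
	 * embedded sub-struct, with an explicit flag.
	 */
	struct sk_psock_parser {
		struct strparser strp;
		bool enabled;
		void (*saved_data_ready)(struct sock *sk);
	};

	/* After: folded directly into sk_psock; a non-NULL
	 * ->saved_data_ready doubles as the "enabled" test, so the
	 * bool can go away.
	 */
	struct sk_psock {
		/* ... */
		struct strparser strp;
		void (*saved_data_ready)(struct sock *sk);
		/* ... */
	};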
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-3-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 53 ++++++++++++++++------------------------------------- net/core/sock_map.c | 8 ++++---- 2 files changed, 20 insertions(+), 41 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index e017744111e1..d00c9a4b47e7 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -907,17 +907,9 @@ static int sk_psock_strp_read_done(struct strparser *strp, int err) return err; } -static struct sk_psock *sk_psock_from_strp(struct strparser *strp) -{ - struct sk_psock_parser *parser; - - parser = container_of(strp, struct sk_psock_parser, strp); - return container_of(parser, struct sk_psock, parser); -} - static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) { - struct sk_psock *psock = sk_psock_from_strp(strp); + struct sk_psock *psock = container_of(strp, struct sk_psock, strp); struct bpf_prog *prog; int ret = skb->len; @@ -941,10 +933,10 @@ static void sk_psock_strp_data_ready(struct sock *sk) psock = sk_psock(sk); if (likely(psock)) { if (tls_sw_has_ctx_rx(sk)) { - psock->parser.saved_data_ready(sk); + psock->saved_data_ready(sk); } else { write_lock_bh(&sk->sk_callback_lock); - strp_data_ready(&psock->parser.strp); + strp_data_ready(&psock->strp); write_unlock_bh(&sk->sk_callback_lock); } } @@ -959,41 +951,34 @@ int sk_psock_init_strp(struct sock *sk, struct sk_psock *psock) .parse_msg = sk_psock_strp_parse, }; - psock->parser.enabled = false; - return strp_init(&psock->parser.strp, sk, &cb); + return strp_init(&psock->strp, sk, &cb); } void sk_psock_start_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_strp_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - strp_stop(&parser->strp); - parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; + strp_stop(&psock->strp); } static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ if (psock->progs.skb_parser) - strp_done(&psock->parser.strp); + strp_done(&psock->strp); } #else static void sk_psock_done_strp(struct sk_psock *psock) @@ -1054,25 +1039,19 @@ static void sk_psock_verdict_data_ready(struct sock *sk) void sk_psock_start_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (parser->enabled) + if (psock->saved_data_ready) return; - parser->saved_data_ready = sk->sk_data_ready; + psock->saved_data_ready = sk->sk_data_ready; sk->sk_data_ready = sk_psock_verdict_data_ready; sk->sk_write_space = sk_psock_write_space; - parser->enabled = true; } void sk_psock_stop_verdict(struct sock *sk, struct sk_psock *psock) { - struct sk_psock_parser *parser = &psock->parser; - - if (!parser->enabled) + if (!psock->saved_data_ready) return; - sk->sk_data_ready = parser->saved_data_ready; - parser->saved_data_ready = NULL; - 
parser->enabled = false; + sk->sk_data_ready = psock->saved_data_ready; + psock->saved_data_ready = NULL; } diff --git a/net/core/sock_map.c b/net/core/sock_map.c index ee3334dd3a38..1a28a5c2c61e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->parser.enabled && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.skb_parser) strp_stop = true; - if (psock->parser.enabled && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.skb_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -283,14 +283,14 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->parser.enabled) { + if (skb_parser && skb_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; psock_set_prog(&psock->progs.skb_verdict, skb_verdict); psock_set_prog(&psock->progs.skb_parser, skb_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->parser.enabled) { + } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.skb_verdict, skb_verdict); sk_psock_start_verdict(sk,psock); } -- cgit v1.2.1 From 16137b09a66f2b75090f1e56a9ba0e27ef845ebc Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:28 -0800 Subject: bpf: Compute data_end dynamically with JIT code Currently, we compute ->data_end with a compile-time constant offset of skb. But as Jakub pointed out, we can actually compute it in eBPF JIT code at run-time, so that we can completely get rid of ->data_end. This is similar to skb_shinfo(skb) computation in bpf_convert_shinfo_access().
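The instruction sequence added by the diff below amounts to the following C computation; this helper is purely illustrative:

	/* data_end is the end of the skb's linear area, i.e.
	 * skb->data + skb_headlen(skb), where
	 * skb_headlen(skb) == skb->len - skb->data_len.
	 */
	static inline void *sk_skb_data_end(const struct sk_buff *skb)
	{
		return skb->data + skb->len - skb->data_len;
	}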
Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-4-xiyou.wangcong@gmail.com --- net/core/filter.c | 48 ++++++++++++++++++++++++++++-------------------- net/core/skmsg.c | 1 - 2 files changed, 28 insertions(+), 21 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index adfdad234674..13bcf248ee7b 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1863,10 +1863,7 @@ static const struct bpf_func_proto bpf_sk_fullsock_proto = { static inline int sk_skb_try_make_writable(struct sk_buff *skb, unsigned int write_len) { - int err = __bpf_try_make_writable(skb, write_len); - - bpf_compute_data_end_sk_skb(skb); - return err; + return __bpf_try_make_writable(skb, write_len); } BPF_CALL_2(sk_skb_pull_data, struct sk_buff *, skb, u32, len) @@ -3577,7 +3574,6 @@ BPF_CALL_4(sk_skb_adjust_room, struct sk_buff *, skb, s32, len_diff, return -ENOMEM; __skb_pull(skb, len_diff_abs); } - bpf_compute_data_end_sk_skb(skb); if (tls_sw_has_ctx_rx(skb->sk)) { struct strp_msg *rxm = strp_msg(skb); @@ -3742,10 +3738,7 @@ static const struct bpf_func_proto bpf_skb_change_tail_proto = { BPF_CALL_3(sk_skb_change_tail, struct sk_buff *, skb, u32, new_len, u64, flags) { - int ret = __bpf_skb_change_tail(skb, new_len, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_tail(skb, new_len, flags); } static const struct bpf_func_proto sk_skb_change_tail_proto = { @@ -3808,10 +3801,7 @@ static const struct bpf_func_proto bpf_skb_change_head_proto = { BPF_CALL_3(sk_skb_change_head, struct sk_buff *, skb, u32, head_room, u64, flags) { - int ret = __bpf_skb_change_head(skb, head_room, flags); - - bpf_compute_data_end_sk_skb(skb); - return ret; + return __bpf_skb_change_head(skb, head_room, flags); } static const struct bpf_func_proto sk_skb_change_head_proto = { @@ -9655,22 +9645,40 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, return insn - insn_buf; } +/* data_end = skb->data + skb_headlen() */ +static struct bpf_insn *bpf_convert_data_end_access(const struct bpf_insn *si, + struct bpf_insn *insn) +{ + /* si->dst_reg = skb->data */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data), + si->dst_reg, si->src_reg, + offsetof(struct sk_buff, data)); + /* AX = skb->len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, len)); + /* si->dst_reg = skb->data + skb->len */ + *insn++ = BPF_ALU64_REG(BPF_ADD, si->dst_reg, BPF_REG_AX); + /* AX = skb->data_len */ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct sk_buff, data_len), + BPF_REG_AX, si->src_reg, + offsetof(struct sk_buff, data_len)); + /* si->dst_reg = skb->data + skb->len - skb->data_len */ + *insn++ = BPF_ALU64_REG(BPF_SUB, si->dst_reg, BPF_REG_AX); + + return insn; +} + static u32 sk_skb_convert_ctx_access(enum bpf_access_type type, const struct bpf_insn *si, struct bpf_insn *insn_buf, struct bpf_prog *prog, u32 *target_size) { struct bpf_insn *insn = insn_buf; - int off; switch (si->off) { case offsetof(struct __sk_buff, data_end): - off = si->off; - off -= offsetof(struct __sk_buff, data_end); - off += offsetof(struct sk_buff, cb); - off += offsetof(struct tcp_skb_cb, bpf.data_end); - *insn++ = BPF_LDX_MEM(BPF_SIZEOF(void *), si->dst_reg, - si->src_reg, off); + insn = bpf_convert_data_end_access(si, insn); break; default: return bpf_convert_ctx_access(type, 
si, insn_buf, prog, diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d00c9a4b47e7..8822001ab3dc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -746,7 +746,6 @@ EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, struct sk_buff *skb) { - bpf_compute_data_end_sk_skb(skb); return bpf_prog_run_pin_on_cpu(prog, skb); } -- cgit v1.2.1 From e3526bb92a2084cdaec6cb2855bcec98b280426c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:29 -0800 Subject: skmsg: Move sk_redir from TCP_SKB_CB to skb Currently TCP_SKB_CB() is hard-coded in skmsg code, so it certainly does not work for any non-TCP protocols. We can move them to skb ext, but it introduces a memory allocation on the fast path. Fortunately, we only need a single word to store all the information, because the flags actually contain only 1 bit, so they can just be packed into the lowest bit of the "pointer", which is stored as an unsigned long. Inside struct sk_buff, '_skb_refdst' can be reused because skb dst is no longer needed after ->sk_data_ready(), so we can just drop it. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-5-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 31 +++++++++++++++++++------------ net/core/sock_map.c | 8 ++------ 2 files changed, 21 insertions(+), 18 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 8822001ab3dc..409258367bea 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -525,7 +525,8 @@ static void sk_psock_backlog(struct work_struct *work) len = skb->len; off = 0; start: - ingress = tcp_skb_bpf_ingress(skb); + ingress = skb_bpf_ingress(skb); + skb_bpf_redirect_clear(skb); do { ret = -EIO; if (likely(psock->sk->sk_socket)) @@ -631,7 +632,12 @@ void __sk_psock_purge_ingress_msg(struct sk_psock *psock) static void sk_psock_zap_ingress(struct sk_psock *psock) { - __skb_queue_purge(&psock->ingress_skb); + struct sk_buff *skb; + + while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + skb_bpf_redirect_clear(skb); + kfree_skb(skb); + } __sk_psock_purge_ingress_msg(psock); } @@ -754,7 +760,7 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) struct sk_psock *psock_other; struct sock *sk_other; - sk_other = tcp_skb_bpf_redirect_fetch(skb); + sk_other = skb_bpf_redirect_fetch(skb); /* This error is a buggy BPF program, it returned a redirect * return code, but then didn't set a redirect interface. */ @@ -804,9 +810,10 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) * TLS context. */ skb->sk = psock->sk; - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } sk_psock_tls_verdict_apply(skb, psock->sk, ret); @@ -818,7 +825,6 @@ EXPORT_SYMBOL_GPL(sk_psock_tls_strp_read); static void sk_psock_verdict_apply(struct sk_psock *psock, struct sk_buff *skb, int verdict) { - struct tcp_skb_cb *tcp; struct sock *sk_other; int err = -EIO; @@ -830,8 +836,7 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, goto out_free; } - tcp = TCP_SKB_CB(skb); - tcp->bpf.flags |= BPF_F_INGRESS; + skb_bpf_set_ingress(skb); /* If the queue is empty then we can submit directly * into the msg queue.
If its not empty we have to @@ -892,9 +897,10 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: @@ -1011,9 +1017,10 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { - tcp_skb_bpf_redirect_clear(skb); + skb_dst_drop(skb); + skb_bpf_redirect_clear(skb); ret = sk_psock_bpf_run(psock, prog, skb); - ret = sk_psock_map_verd(ret, tcp_skb_bpf_redirect_fetch(skb)); + ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); out: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 1a28a5c2c61e..dbfcd7006338 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -657,7 +657,6 @@ const struct bpf_func_proto bpf_sock_map_update_proto = { BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, struct bpf_map *, map, u32, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -667,8 +666,7 @@ BPF_CALL_4(bpf_sk_redirect_map, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } @@ -1250,7 +1248,6 @@ const struct bpf_func_proto bpf_sock_hash_update_proto = { BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, struct bpf_map *, map, void *, key, u64, flags) { - struct tcp_skb_cb *tcb = TCP_SKB_CB(skb); struct sock *sk; if (unlikely(flags & ~(BPF_F_INGRESS))) @@ -1260,8 +1257,7 @@ BPF_CALL_4(bpf_sk_redirect_hash, struct sk_buff *, skb, if (unlikely(!sk || !sock_map_redirect_allowed(sk))) return SK_DROP; - tcb->bpf.flags = flags; - tcb->bpf.sk_redir = sk; + skb_bpf_set_redir(skb, sk, flags & BPF_F_INGRESS); return SK_PASS; } -- cgit v1.2.1 From ae8b8332fbb512f53bf50ff6a7586dd0f90ed18a Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:30 -0800 Subject: sock_map: Rename skb_parser and skb_verdict These two eBPF programs are tied to BPF_SK_SKB_STREAM_PARSER and BPF_SK_SKB_STREAM_VERDICT, rename them to reflect the fact they are only used for TCP. And save the name 'skb_verdict' for general use later. 
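The effect on the program table is a pure rename; roughly (struct abridged, old names in comments):

	struct sk_psock_progs {
		struct bpf_prog *msg_parser;
		struct bpf_prog *stream_parser;		/* was: skb_parser */
		struct bpf_prog *stream_verdict;	/* was: skb_verdict */
	};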
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Lorenz Bauer Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-6-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 14 ++++++------- net/core/sock_map.c | 60 ++++++++++++++++++++++++++--------------------------- 2 files changed, 37 insertions(+), 37 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 409258367bea..35f9caa3b125 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -691,9 +691,9 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); rcu_assign_sk_user_data(sk, NULL); - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.skb_verdict) + else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); @@ -803,7 +803,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) int ret = __SK_PASS; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { /* We skip full set_owner_r here because if we do a SK_PASS * or SK_DROP we can skip skb memory accounting and use the @@ -895,7 +895,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); @@ -919,7 +919,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) int ret = skb->len; rcu_read_lock(); - prog = READ_ONCE(psock->progs.skb_parser); + prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; ret = sk_psock_bpf_run(psock, prog, skb); @@ -982,7 +982,7 @@ void sk_psock_stop_strp(struct sock *sk, struct sk_psock *psock) static void sk_psock_done_strp(struct sk_psock *psock) { /* Parser has been stopped */ - if (psock->progs.skb_parser) + if (psock->progs.stream_parser) strp_done(&psock->strp); } #else @@ -1015,7 +1015,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, goto out; } skb_set_owner_r(skb, sk); - prog = READ_ONCE(psock->progs.skb_verdict); + prog = READ_ONCE(psock->progs.stream_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dbfcd7006338..69785070f02d 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -148,9 +148,9 @@ static void sock_map_del_link(struct sock *sk, struct bpf_map *map = link->map; struct bpf_stab *stab = container_of(map, struct bpf_stab, map); - if (psock->saved_data_ready && stab->progs.skb_parser) + if (psock->saved_data_ready && stab->progs.stream_parser) strp_stop = true; - if (psock->saved_data_ready && stab->progs.skb_verdict) + if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); @@ -224,23 +224,23 @@ out: static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, struct sock *sk) { - struct bpf_prog *msg_parser, *skb_parser, *skb_verdict; + struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock *psock; int ret; - skb_verdict = READ_ONCE(progs->skb_verdict); - if (skb_verdict) { - 
skb_verdict = bpf_prog_inc_not_zero(skb_verdict); - if (IS_ERR(skb_verdict)) - return PTR_ERR(skb_verdict); + stream_verdict = READ_ONCE(progs->stream_verdict); + if (stream_verdict) { + stream_verdict = bpf_prog_inc_not_zero(stream_verdict); + if (IS_ERR(stream_verdict)) + return PTR_ERR(stream_verdict); } - skb_parser = READ_ONCE(progs->skb_parser); - if (skb_parser) { - skb_parser = bpf_prog_inc_not_zero(skb_parser); - if (IS_ERR(skb_parser)) { - ret = PTR_ERR(skb_parser); - goto out_put_skb_verdict; + stream_parser = READ_ONCE(progs->stream_parser); + if (stream_parser) { + stream_parser = bpf_prog_inc_not_zero(stream_parser); + if (IS_ERR(stream_parser)) { + ret = PTR_ERR(stream_parser); + goto out_put_stream_verdict; } } @@ -249,7 +249,7 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, msg_parser = bpf_prog_inc_not_zero(msg_parser); if (IS_ERR(msg_parser)) { ret = PTR_ERR(msg_parser); - goto out_put_skb_parser; + goto out_put_stream_parser; } } @@ -261,8 +261,8 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || - (skb_parser && READ_ONCE(psock->progs.skb_parser)) || - (skb_verdict && READ_ONCE(psock->progs.skb_verdict))) { + (stream_parser && READ_ONCE(psock->progs.stream_parser)) || + (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; goto out_progs; @@ -283,15 +283,15 @@ static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, goto out_drop; write_lock_bh(&sk->sk_callback_lock); - if (skb_parser && skb_verdict && !psock->saved_data_ready) { + if (stream_parser && stream_verdict && !psock->saved_data_ready) { ret = sk_psock_init_strp(sk, psock); if (ret) goto out_unlock_drop; - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); - psock_set_prog(&psock->progs.skb_parser, skb_parser); + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); + psock_set_prog(&psock->progs.stream_parser, stream_parser); sk_psock_start_strp(sk, psock); - } else if (!skb_parser && skb_verdict && !psock->saved_data_ready) { - psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); } write_unlock_bh(&sk->sk_callback_lock); @@ -303,12 +303,12 @@ out_drop: out_progs: if (msg_parser) bpf_prog_put(msg_parser); -out_put_skb_parser: - if (skb_parser) - bpf_prog_put(skb_parser); -out_put_skb_verdict: - if (skb_verdict) - bpf_prog_put(skb_verdict); +out_put_stream_parser: + if (stream_parser) + bpf_prog_put(stream_parser); +out_put_stream_verdict: + if (stream_verdict) + bpf_prog_put(stream_verdict); return ret; } @@ -1459,11 +1459,11 @@ int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #if IS_ENABLED(CONFIG_BPF_STREAM_PARSER) case BPF_SK_SKB_STREAM_PARSER: - pprog = &progs->skb_parser; + pprog = &progs->stream_parser; break; #endif case BPF_SK_SKB_STREAM_VERDICT: - pprog = &progs->skb_verdict; + pprog = &progs->stream_verdict; break; default: return -EOPNOTSUPP; -- cgit v1.2.1 From 4675e234b9e15159894b90ead9340e1dc202b670 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:31 -0800 Subject: sock_map: Make sock_map_prog_update() static It is only used within sock_map.c so can become static. 
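Since code earlier in sock_map.c refers to the function before its definition, the patch pairs the static definition with a forward declaration near the top of the file. The generic pattern, with hypothetical names:

	static int do_update(int v);		/* forward declaration */

	static int caller(void)
	{
		return do_update(1);		/* use before the definition */
	}

	static int do_update(int v)		/* definition stays put */
	{
		return v;
	}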
Suggested-by: Jakub Sitnicki Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-7-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 69785070f02d..dd53a7771d7e 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -24,6 +24,9 @@ struct bpf_stab { #define SOCK_CREATE_FLAG_MASK \ (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which); + static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { struct bpf_stab *stab; @@ -1444,8 +1447,8 @@ static struct sk_psock_progs *sock_map_progs(struct bpf_map *map) return NULL; } -int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, - struct bpf_prog *old, u32 which) +static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, + struct bpf_prog *old, u32 which) { struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog **pprog; -- cgit v1.2.1 From cd81cefb1abc52bd164f4d9760cd22eadc0e4468 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:32 -0800 Subject: skmsg: Make __sk_psock_purge_ingress_msg() static It is only used within skmsg.c so can become static. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-8-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 35f9caa3b125..46e29d2c0c48 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -619,7 +619,7 @@ struct sk_psock_link *sk_psock_link_pop(struct sk_psock *psock) return link; } -void __sk_psock_purge_ingress_msg(struct sk_psock *psock) +static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) { struct sk_msg *msg, *tmp; -- cgit v1.2.1 From 533342322276b06b4db260c413ce907238851e9b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 23 Feb 2021 10:49:33 -0800 Subject: skmsg: Get rid of sk_psock_bpf_run() It is now nearly identical to bpf_prog_run_pin_on_cpu() and it has an unused parameter 'psock', so we can just get rid of it and call bpf_prog_run_pin_on_cpu() directly. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210223184934.6054-9-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 46e29d2c0c48..07f54015238a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -749,12 +749,6 @@ out: } EXPORT_SYMBOL_GPL(sk_psock_msg_verdict); -static int sk_psock_bpf_run(struct sk_psock *psock, struct bpf_prog *prog, - struct sk_buff *skb) -{ - return bpf_prog_run_pin_on_cpu(prog, skb); -} - static void sk_psock_skb_redirect(struct sk_buff *skb) { struct sk_psock *psock_other; @@ -812,7 +806,7 @@ int sk_psock_tls_strp_read(struct sk_psock *psock, struct sk_buff *skb) skb->sk = psock->sk; skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); skb->sk = NULL; } @@ -899,7 +893,7 @@ static void sk_psock_strp_read(struct strparser *strp, struct sk_buff *skb) if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); @@ -922,7 +916,7 @@ static int sk_psock_strp_parse(struct strparser *strp, struct sk_buff *skb) prog = READ_ONCE(psock->progs.stream_parser); if (likely(prog)) { skb->sk = psock->sk; - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); skb->sk = NULL; } rcu_read_unlock(); @@ -1019,7 +1013,7 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); - ret = sk_psock_bpf_run(psock, prog, skb); + ret = bpf_prog_run_pin_on_cpu(prog, skb); ret = sk_psock_map_verd(ret, skb_bpf_redirect_fetch(skb)); } sk_psock_verdict_apply(psock, skb, ret); -- cgit v1.2.1 From 7c32e8f8bc33a5f4b113a630857e46634e3e143b Mon Sep 17 00:00:00 2001 From: Lorenz Bauer Date: Wed, 3 Mar 2021 10:18:13 +0000 Subject: bpf: Add PROG_TEST_RUN support for sk_lookup programs Allow passing sk_lookup programs to PROG_TEST_RUN. User space provides the full bpf_sk_lookup struct as context. Since the context includes a socket pointer that can't be exposed to user space, we define that PROG_TEST_RUN returns the cookie of the selected socket or zero in place of the socket pointer. We don't support testing programs that select a reuseport socket, since this would mean running another (unrelated) BPF program from the sk_lookup test handler.
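From user space this looks roughly as follows with libbpf's bpf_prog_test_run_xattr(); a hedged sketch with illustrative values (the cookie union member in bpf_sk_lookup comes from the companion uapi patch, which is not shown here):

	#include <bpf/bpf.h>
	#include <linux/bpf.h>

	static int run_sk_lookup_once(int prog_fd, struct bpf_sk_lookup *ctx)
	{
		struct bpf_prog_test_run_attr tattr = {
			.prog_fd      = prog_fd,
			.ctx_in       = ctx,	/* full bpf_sk_lookup in */
			.ctx_size_in  = sizeof(*ctx),
			.ctx_out      = ctx,	/* cookie comes back out */
			.ctx_size_out = sizeof(*ctx),
		};
		int err = bpf_prog_test_run_xattr(&tattr);

		if (err)
			return err;
		/* tattr.retval is the program's verdict (e.g. SK_PASS);
		 * ctx->cookie is the selected socket's cookie, or 0 if
		 * the program selected no socket.
		 */
		return 0;
	}

Note there is no packet payload here: the sk_lookup test run takes only the context, no data_in.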
Signed-off-by: Lorenz Bauer Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210303101816.36774-3-lmb@cloudflare.com --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 13bcf248ee7b..a526db494c62 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -10457,6 +10457,7 @@ static u32 sk_lookup_convert_ctx_access(enum bpf_access_type type, } const struct bpf_prog_ops sk_lookup_prog_ops = { + .test_run = bpf_prog_test_run_sk_lookup, }; const struct bpf_verifier_ops sk_lookup_verifier_ops = { -- cgit v1.2.1 From d01b59c9ae94560fbcceaafeef39784d72765033 Mon Sep 17 00:00:00 2001 From: Xuesen Huang Date: Thu, 4 Mar 2021 14:40:46 +0800 Subject: bpf: Add bpf_skb_adjust_room flag BPF_F_ADJ_ROOM_ENCAP_L2_ETH bpf_skb_adjust_room sets the inner_protocol as skb->protocol for packet encapsulation. But that is not appropriate when pushing an Ethernet header. Add an option to further specify the encap L2 type and set the inner_protocol to ETH_P_TEB. Suggested-by: Willem de Bruijn Signed-off-by: Xuesen Huang Signed-off-by: Zhiyong Cheng Signed-off-by: Li Wang Signed-off-by: Daniel Borkmann Acked-by: Willem de Bruijn Link: https://lore.kernel.org/bpf/20210304064046.6232-1-hxseverything@gmail.com --- net/core/filter.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index a526db494c62..588b19ba0da8 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3409,6 +3409,7 @@ static u32 bpf_skb_net_base_len(const struct sk_buff *skb) BPF_F_ADJ_ROOM_ENCAP_L3_MASK | \ BPF_F_ADJ_ROOM_ENCAP_L4_GRE | \ BPF_F_ADJ_ROOM_ENCAP_L4_UDP | \ + BPF_F_ADJ_ROOM_ENCAP_L2_ETH | \ BPF_F_ADJ_ROOM_ENCAP_L2( \ BPF_ADJ_ROOM_ENCAP_L2_MASK)) @@ -3445,6 +3446,10 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, flags & BPF_F_ADJ_ROOM_ENCAP_L4_UDP) return -EINVAL; + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH && + inner_mac_len < ETH_HLEN) + return -EINVAL; + if (skb->encapsulation) return -EALREADY; @@ -3463,7 +3468,11 @@ static int bpf_skb_net_grow(struct sk_buff *skb, u32 off, u32 len_diff, skb->inner_mac_header = inner_net - inner_mac_len; skb->inner_network_header = inner_net; skb->inner_transport_header = inner_trans; - skb_set_inner_protocol(skb, skb->protocol); + + if (flags & BPF_F_ADJ_ROOM_ENCAP_L2_ETH) + skb_set_inner_protocol(skb, htons(ETH_P_TEB)); + else + skb_set_inner_protocol(skb, skb->protocol); skb->encapsulation = 1; skb_set_network_header(skb, mac_len); -- cgit v1.2.1 From e6a4750ffe9d701c4d55212b14b615e63571d235 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Mar 2021 12:29:06 +0100 Subject: bpf, xdp: Make bpf_redirect_map() a map operation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently the bpf_redirect_map() implementation dispatches to the correct map-lookup function via a switch statement. To avoid the dispatching, this change adds bpf_redirect_map() as a map operation. Each map provides its bpf_redirect_map() version, and the correct function is automatically selected by the BPF verifier. A nice side-effect of the code movement is that the map lookup functions are now local to the map implementation files, which removes one additional function call.
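On the map side, each implementation registers its own callback in its bpf_map_ops; an assumed-shape sketch based on this series (devmap shown, simplified):

	static int dev_map_redirect(struct bpf_map *map, u32 ifindex, u64 flags)
	{
		/* a shared inline helper performs the lookup and fills the
		 * per-CPU redirect state, returning XDP_REDIRECT on success
		 */
		return __bpf_xdp_redirect_map(map, ifindex, flags,
					      __dev_map_lookup_elem);
	}

	const struct bpf_map_ops dev_map_ops = {
		/* ... existing callbacks ... */
		.map_redirect = dev_map_redirect,
	};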
Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-2-bjorn.topel@gmail.com --- net/core/filter.c | 39 +-------------------------------------- 1 file changed, 1 insertion(+), 38 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 588b19ba0da8..183b0aa6b027 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3943,22 +3943,6 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -static inline void *__xdp_map_lookup_elem(struct bpf_map *map, u32 index) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - return __dev_map_lookup_elem(map, index); - case BPF_MAP_TYPE_DEVMAP_HASH: - return __dev_map_hash_lookup_elem(map, index); - case BPF_MAP_TYPE_CPUMAP: - return __cpu_map_lookup_elem(map, index); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_lookup_elem(map, index); - default: - return NULL; - } -} - void bpf_clear_redirect_map(struct bpf_map *map) { struct bpf_redirect_info *ri; @@ -4112,28 +4096,7 @@ static const struct bpf_func_proto bpf_xdp_redirect_proto = { BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex, u64, flags) { - struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - - /* Lower bits of the flags are used as return code on lookup failure */ - if (unlikely(flags > XDP_TX)) - return XDP_ABORTED; - - ri->tgt_value = __xdp_map_lookup_elem(map, ifindex); - if (unlikely(!ri->tgt_value)) { - /* If the lookup fails we want to clear out the state in the - * redirect_info struct completely, so that if an eBPF program - * performs multiple lookups, the last one always takes - * precedence. - */ - WRITE_ONCE(ri->map, NULL); - return flags; - } - - ri->flags = flags; - ri->tgt_index = ifindex; - WRITE_ONCE(ri->map, map); - - return XDP_REDIRECT; + return map->ops->map_redirect(map, ifindex, flags); } static const struct bpf_func_proto bpf_xdp_redirect_map_proto = { -- cgit v1.2.1 From ee75aef23afe6e88497151c127c13ed69f41aaa2 Mon Sep 17 00:00:00 2001 From: Björn Töpel Date: Mon, 8 Mar 2021 12:29:07 +0100 Subject: bpf, xdp: Restructure redirect actions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The XDP_REDIRECT implementations for maps and non-maps are fairly similar, but obviously need to take different code paths depending on whether the target is using a map or not. Today, the redirect targets for XDP either use a map or are based on ifindex. Here, the map type and id are added to bpf_redirect_info, instead of the actual map. Map type, map item/ifindex, and the map_id (if any) are passed to xdp_do_redirect(). For ifindex-based redirect, used by the bpf_redirect() XDP BPF helper, a special map type/id are used. Map type of UNSPEC together with map id equal to INT_MAX has the special meaning of an ifindex-based redirect. Note that valid map ids are 1 inclusive, INT_MAX exclusive ([1,INT_MAX[). In addition to making the code easier to follow, using explicit type and id in bpf_redirect_info has a slight positive performance impact by avoiding a pointer indirection for the map type lookup, instead using the cacheline for bpf_redirect_info. Since the actual map is not passed via bpf_redirect_info anymore, the map lookup is only done in the BPF helper. This means that the bpf_clear_redirect_map() function can be removed. The actual map item is RCU protected.
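After the change, the per-CPU scratch state has roughly this shape (abridged; the struct bpf_map pointer is gone):

	struct bpf_redirect_info {
		u32 flags;
		u32 tgt_index;			/* map key, or ifindex */
		void *tgt_value;		/* resolved map element, if any */
		u32 map_id;			/* INT_MAX marks ifindex redirect */
		enum bpf_map_type map_type;	/* UNSPEC for non-map redirect */
		/* ... */
	};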
The bpf_redirect_info flags member is not used by XDP, and not read/written any more. The map member is only written to when required/used, and not unconditionally. Signed-off-by: Björn Töpel Signed-off-by: Daniel Borkmann Reviewed-by: Maciej Fijalkowski Acked-by: Jesper Dangaard Brouer Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210308112907.559576-3-bjorn.topel@gmail.com --- net/core/filter.c | 170 ++++++++++++++++++++++++------------------------------ 1 file changed, 75 insertions(+), 95 deletions(-) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 183b0aa6b027..b6732000d8a2 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -3918,23 +3918,6 @@ static const struct bpf_func_proto bpf_xdp_adjust_meta_proto = { .arg2_type = ARG_ANYTHING, }; -static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd, - struct bpf_map *map, struct xdp_buff *xdp) -{ - switch (map->map_type) { - case BPF_MAP_TYPE_DEVMAP: - case BPF_MAP_TYPE_DEVMAP_HASH: - return dev_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_CPUMAP: - return cpu_map_enqueue(fwd, xdp, dev_rx); - case BPF_MAP_TYPE_XSKMAP: - return __xsk_map_redirect(fwd, xdp); - default: - return -EBADRQC; - } - return 0; -} - void xdp_do_flush(void) { __dev_flush(); @@ -3943,55 +3926,52 @@ void xdp_do_flush(void) } EXPORT_SYMBOL_GPL(xdp_do_flush); -void bpf_clear_redirect_map(struct bpf_map *map) -{ - struct bpf_redirect_info *ri; - int cpu; - - for_each_possible_cpu(cpu) { - ri = per_cpu_ptr(&bpf_redirect_info, cpu); - /* Avoid polluting remote cacheline due to writes if - * not needed. Once we pass this test, we need the - * cmpxchg() to make sure it hasn't been changed in - * the meantime by remote CPU. - */ - if (unlikely(READ_ONCE(ri->map) == map)) - cmpxchg(&ri->map, map, NULL); - } -} - int xdp_do_redirect(struct net_device *dev, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; + enum bpf_map_type map_type = ri->map_type; void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; int err; - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - if (unlikely(!map)) { - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_CPUMAP: + err = cpu_map_enqueue(fwd, xdp, dev); + break; + case BPF_MAP_TYPE_XSKMAP: + err = __xsk_map_redirect(fwd, xdp); + break; + case BPF_MAP_TYPE_UNSPEC: + if (map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + break; + } + err = dev_xdp_enqueue(fwd, xdp, dev); + break; } - - err = dev_xdp_enqueue(fwd, xdp, dev); - } else { - err = __bpf_tx_xdp_map(dev, fwd, map, xdp); + fallthrough; + default: + err = -EBADRQC; } if (unlikely(err)) goto err; - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } EXPORT_SYMBOL_GPL(xdp_do_redirect); @@ -4000,41 +3980,36 @@ 
static int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog, - struct bpf_map *map) + void *fwd, + enum bpf_map_type map_type, u32 map_id) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - u32 index = ri->tgt_index; - void *fwd = ri->tgt_value; - int err = 0; - - ri->tgt_index = 0; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); - - if (map->map_type == BPF_MAP_TYPE_DEVMAP || - map->map_type == BPF_MAP_TYPE_DEVMAP_HASH) { - struct bpf_dtab_netdev *dst = fwd; + int err; - err = dev_map_generic_redirect(dst, skb, xdp_prog); + switch (map_type) { + case BPF_MAP_TYPE_DEVMAP: + fallthrough; + case BPF_MAP_TYPE_DEVMAP_HASH: + err = dev_map_generic_redirect(fwd, skb, xdp_prog); if (unlikely(err)) goto err; - } else if (map->map_type == BPF_MAP_TYPE_XSKMAP) { - struct xdp_sock *xs = fwd; - - err = xsk_generic_rcv(xs, xdp); + break; + case BPF_MAP_TYPE_XSKMAP: + err = xsk_generic_rcv(fwd, xdp); if (err) goto err; consume_skb(skb); - } else { + break; + default: /* TODO: Handle BPF_MAP_TYPE_CPUMAP */ err = -EBADRQC; goto err; } - _trace_xdp_redirect_map(dev, xdp_prog, fwd, map, index); + _trace_xdp_redirect_map(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index); return 0; err: - _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map, index, err); + _trace_xdp_redirect_map_err(dev, xdp_prog, fwd, map_type, map_id, ri->tgt_index, err); return err; } @@ -4042,31 +4017,34 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb, struct xdp_buff *xdp, struct bpf_prog *xdp_prog) { struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info); - struct bpf_map *map = READ_ONCE(ri->map); - u32 index = ri->tgt_index; - struct net_device *fwd; - int err = 0; - - if (map) - return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, - map); - ri->tgt_index = 0; - fwd = dev_get_by_index_rcu(dev_net(dev), index); - if (unlikely(!fwd)) { - err = -EINVAL; - goto err; - } + enum bpf_map_type map_type = ri->map_type; + void *fwd = ri->tgt_value; + u32 map_id = ri->map_id; + int err; - err = xdp_ok_fwd_dev(fwd, skb->len); - if (unlikely(err)) - goto err; + ri->map_id = 0; /* Valid map id idr range: [1,INT_MAX[ */ + ri->map_type = BPF_MAP_TYPE_UNSPEC; - skb->dev = fwd; - _trace_xdp_redirect(dev, xdp_prog, index); - generic_xdp_tx(skb, xdp_prog); - return 0; + if (map_type == BPF_MAP_TYPE_UNSPEC && map_id == INT_MAX) { + fwd = dev_get_by_index_rcu(dev_net(dev), ri->tgt_index); + if (unlikely(!fwd)) { + err = -EINVAL; + goto err; + } + + err = xdp_ok_fwd_dev(fwd, skb->len); + if (unlikely(err)) + goto err; + + skb->dev = fwd; + _trace_xdp_redirect(dev, xdp_prog, ri->tgt_index); + generic_xdp_tx(skb, xdp_prog); + return 0; + } + + return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog, fwd, map_type, map_id); err: - _trace_xdp_redirect_err(dev, xdp_prog, index, err); + _trace_xdp_redirect_err(dev, xdp_prog, ri->tgt_index, err); return err; } @@ -4077,10 +4055,12 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags) if (unlikely(flags)) return XDP_ABORTED; - ri->flags = flags; + /* NB! Map type UNSPEC and map_id == INT_MAX (never generated + * by map_idr) is used for ifindex based XDP redirect. + */ ri->tgt_index = ifindex; - ri->tgt_value = NULL; - WRITE_ONCE(ri->map, NULL); + ri->map_id = INT_MAX; + ri->map_type = BPF_MAP_TYPE_UNSPEC; return XDP_REDIRECT; } -- cgit v1.2.1 From b1866bfff9223e4d15727e05a865b744a163eff2 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. 
Silva" Date: Tue, 9 Mar 2021 23:42:43 -0600 Subject: net: core: Fix fall-through warnings for Clang In preparation to enable -Wimplicit-fallthrough for Clang, fix a warning by explicitly adding a break statement instead of letting the code fall through to the next case. Link: https://github.com/KSPP/linux/issues/115 Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6c5967e80132..2bfdd528c7c3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5265,6 +5265,7 @@ skip_classify: goto another_round; case RX_HANDLER_EXACT: deliver_exact = true; + break; case RX_HANDLER_PASS: break; default: -- cgit v1.2.1 From 1ddc3229ad3c40840c24a699ada5cfeb4319b578 Mon Sep 17 00:00:00 2001 From: Yunsheng Lin Date: Wed, 10 Mar 2021 16:28:58 +0800 Subject: skbuff: remove some unnecessary operation in skb_segment_list() gro list uses skb_shinfo(skb)->frag_list to link two skb together, and NAPI_GRO_CB(p)->last->next is used when there are more skb, see skb_gro_receive_list(). gso expects that each segmented skb is linked together using skb->next, so only the first skb->next need to set to skb_shinfo(skb)-> frag_list when doing gso list segment. It is the same reason that nskb->next does not need to be set to list_skb before goto the error handling, because nskb->next already pointers to list_skb. And nskb is also the last skb at the end of loop, so remove tail variable and use nskb instead. Signed-off-by: Yunsheng Lin Signed-off-by: David S. Miller --- net/core/skbuff.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c421c8f80925..e8320b5d651a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3732,13 +3732,13 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_truesize = 0; unsigned int delta_len = 0; - struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int err; skb_push(skb, -skb_network_offset(skb) + offset); skb_shinfo(skb)->frag_list = NULL; + skb->next = list_skb; do { nskb = list_skb; @@ -3756,17 +3756,8 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, } } - if (!tail) - skb->next = nskb; - else - tail->next = nskb; - - if (unlikely(err)) { - nskb->next = list_skb; + if (unlikely(err)) goto err_linearize; - } - - tail = nskb; delta_len += nskb->len; delta_truesize += nskb->truesize; @@ -3793,7 +3784,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_gso_reset(skb); - skb->prev = tail; + skb->prev = nskb; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) -- cgit v1.2.1 From 0ccf4d50d14f360dfae5b25b8ffcb27f98e591f0 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Mar 2021 20:30:05 +0000 Subject: gro: simplify gro_list_prepare() gro_list_prepare() always returns &napi->gro_hash[bucket].list, without any variations. Moreover, it uses 'napi' argument only to have access to this list, and calculates the bucket index for the second time (firstly it happens at the beginning of dev_gro_receive()) to do that. Given that dev_gro_receive() already has an index to the needed list, just pass it as the first argument to eliminate redundant calculations, and make gro_list_prepare() return void. Also, both arguments of gro_list_prepare() can be constified since this function can only modify the skbs from the bucket list. 
Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/dev.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 2bfdd528c7c3..1317e6b6758a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5858,15 +5858,13 @@ void napi_gro_flush(struct napi_struct *napi, bool flush_old) } EXPORT_SYMBOL(napi_gro_flush); -static struct list_head *gro_list_prepare(struct napi_struct *napi, - struct sk_buff *skb) +static void gro_list_prepare(const struct list_head *head, + const struct sk_buff *skb) { unsigned int maclen = skb->dev->hard_header_len; u32 hash = skb_get_hash_raw(skb); - struct list_head *head; struct sk_buff *p; - head = &napi->gro_hash[hash & (GRO_HASH_BUCKETS - 1)].list; list_for_each_entry(p, head, list) { unsigned long diffs; @@ -5892,8 +5890,6 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, maclen); NAPI_GRO_CB(p)->same_flow = !diffs; } - - return head; } static void skb_gro_reset_offset(struct sk_buff *skb) @@ -5957,10 +5953,10 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct list_head *gro_head = &napi->gro_hash[hash].list; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; - struct list_head *gro_head; struct sk_buff *pp = NULL; enum gro_result ret; int same_flow; @@ -5969,7 +5965,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_head = gro_list_prepare(napi, skb); + gro_list_prepare(gro_head, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { -- cgit v1.2.1 From 9dc2c313378528afe1bddf12cad88dbfe0998820 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Mar 2021 20:30:10 +0000 Subject: gro: consistentify napi->gro_hash[x] access in dev_gro_receive() GRO bucket index doesn't change through the entire function. Store a pointer to the corresponding bucket instead of its member and use it consistently through the function. It is performance-safe since &gro_list->list == gro_list. Misc: remove superfluous braces around single-line branches. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- net/core/dev.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1317e6b6758a..b635467087f3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5953,7 +5953,7 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct list_head *gro_head = &napi->gro_hash[hash].list; + struct gro_list *gro_list = &napi->gro_hash[hash]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -5965,7 +5965,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (netif_elide_gro(skb->dev)) goto normal; - gro_list_prepare(gro_head, skb); + gro_list_prepare(&gro_list->list, skb); rcu_read_lock(); list_for_each_entry_rcu(ptype, head, list) { @@ -6001,7 +6001,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff pp = INDIRECT_CALL_INET(ptype->callbacks.gro_receive, ipv6_gro_receive, inet_gro_receive, - gro_head, skb); + &gro_list->list, skb); break; } rcu_read_unlock(); @@ -6020,7 +6020,7 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (pp) { skb_list_del_init(pp); napi_gro_complete(napi, pp); - napi->gro_hash[hash].count--; + gro_list->count--; } if (same_flow) @@ -6029,16 +6029,16 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff if (NAPI_GRO_CB(skb)->flush) goto normal; - if (unlikely(napi->gro_hash[hash].count >= MAX_GRO_SKBS)) { - gro_flush_oldest(napi, gro_head); - } else { - napi->gro_hash[hash].count++; - } + if (unlikely(gro_list->count >= MAX_GRO_SKBS)) + gro_flush_oldest(napi, &gro_list->list); + else + gro_list->count++; + NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; skb_shinfo(skb)->gso_size = skb_gro_len(skb); - list_add(&skb->list, gro_head); + list_add(&skb->list, &gro_list->list); ret = GRO_HELD; pull: @@ -6046,7 +6046,7 @@ pull: if (grow > 0) gro_pull_from_frag0(skb, grow); ok: - if (napi->gro_hash[hash].count) { + if (gro_list->count) { if (!test_bit(hash, &napi->gro_bitmask)) __set_bit(hash, &napi->gro_bitmask); } else if (test_bit(hash, &napi->gro_bitmask)) { -- cgit v1.2.1 From d0eed5c325149002c364a1439ae1afe1992beae4 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sat, 13 Mar 2021 20:30:14 +0000 Subject: gro: give 'hash' variable in dev_gro_receive() a less confusing name 'hash' stores not the flow hash, but the index of the GRO bucket corresponding to it. Change its name to 'bucket' to avoid confusion while reading lines like '__set_bit(hash, &napi->gro_bitmask)'. Signed-off-by: Alexander Lobakin Signed-off-by: David S. 
Miller --- net/core/dev.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b635467087f3..5a2847a19cf2 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5952,8 +5952,8 @@ static void gro_flush_oldest(struct napi_struct *napi, struct list_head *head) static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { - u32 hash = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); - struct gro_list *gro_list = &napi->gro_hash[hash]; + u32 bucket = skb_get_hash_raw(skb) & (GRO_HASH_BUCKETS - 1); + struct gro_list *gro_list = &napi->gro_hash[bucket]; struct list_head *head = &offload_base; struct packet_offload *ptype; __be16 type = skb->protocol; @@ -6047,10 +6047,10 @@ pull: gro_pull_from_frag0(skb, grow); ok: if (gro_list->count) { - if (!test_bit(hash, &napi->gro_bitmask)) - __set_bit(hash, &napi->gro_bitmask); - } else if (test_bit(hash, &napi->gro_bitmask)) { - __clear_bit(hash, &napi->gro_bitmask); + if (!test_bit(bucket, &napi->gro_bitmask)) + __set_bit(bucket, &napi->gro_bitmask); + } else if (test_bit(bucket, &napi->gro_bitmask)) { + __clear_bit(bucket, &napi->gro_bitmask); } return ret; -- cgit v1.2.1 From f96533cded173b3b019001a505a746c3cd8fc323 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Sun, 14 Mar 2021 11:11:23 +0000 Subject: flow_dissector: constify raw input data argument Flow Dissector code never modifies the input buffer, neither skb nor raw data. Make 'data' argument const for all of the Flow dissector's functions. Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 41 ++++++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 19 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 2ef2224b3bff..2ed380d096ce 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -114,7 +114,7 @@ int flow_dissector_bpf_prog_attach_check(struct net *net, * is the protocol port offset returned from proto_ports_offset */ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto, - void *data, int hlen) + const void *data, int hlen) { int poff = proto_ports_offset(ip_proto); @@ -161,7 +161,7 @@ static bool icmp_has_id(u8 type) */ void skb_flow_get_icmp_tci(const struct sk_buff *skb, struct flow_dissector_key_icmp *key_icmp, - void *data, int thoff, int hlen) + const void *data, int thoff, int hlen) { struct icmphdr *ih, _ih; @@ -187,8 +187,8 @@ EXPORT_SYMBOL(skb_flow_get_icmp_tci); */ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_icmp *key_icmp; @@ -409,8 +409,8 @@ EXPORT_SYMBOL(skb_flow_dissect_hash); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen, - int lse_index, bool *entropy_label) + void *target_container, const void *data, int nhoff, + int hlen, int lse_index, bool *entropy_label) { struct mpls_label *hdr, _hdr; u32 entry, label, bos; @@ -467,7 +467,8 @@ __skb_flow_dissect_mpls(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_arp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, int hlen) + 
void *target_container, const void *data, + int nhoff, int hlen) { struct flow_dissector_key_arp *key_arp; struct { @@ -523,7 +524,7 @@ static enum flow_dissect_ret __skb_flow_dissect_gre(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, struct flow_dissector *flow_dissector, - void *target_container, void *data, + void *target_container, const void *data, __be16 *p_proto, int *p_nhoff, int *p_hlen, unsigned int flags) { @@ -663,8 +664,8 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, static enum flow_dissect_ret __skb_flow_dissect_batadv(const struct sk_buff *skb, struct flow_dissector_key_control *key_control, - void *data, __be16 *p_proto, int *p_nhoff, int hlen, - unsigned int flags) + const void *data, __be16 *p_proto, int *p_nhoff, + int hlen, unsigned int flags) { struct { struct batadv_unicast_packet batadv_unicast; @@ -695,7 +696,8 @@ __skb_flow_dissect_batadv(const struct sk_buff *skb, static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int thoff, int hlen) + void *target_container, const void *data, + int thoff, int hlen) { struct flow_dissector_key_tcp *key_tcp; struct tcphdr *th, _th; @@ -719,8 +721,8 @@ __skb_flow_dissect_tcp(const struct sk_buff *skb, static void __skb_flow_dissect_ports(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, int nhoff, - u8 ip_proto, int hlen) + void *target_container, const void *data, + int nhoff, u8 ip_proto, int hlen) { enum flow_dissector_key_id dissector_ports = FLOW_DISSECTOR_KEY_MAX; struct flow_dissector_key_ports *key_ports; @@ -744,7 +746,8 @@ __skb_flow_dissect_ports(const struct sk_buff *skb, static void __skb_flow_dissect_ipv4(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct iphdr *iph) + void *target_container, const void *data, + const struct iphdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -761,7 +764,8 @@ __skb_flow_dissect_ipv4(const struct sk_buff *skb, static void __skb_flow_dissect_ipv6(const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, void *data, const struct ipv6hdr *iph) + void *target_container, const void *data, + const struct ipv6hdr *iph) { struct flow_dissector_key_ip *key_ip; @@ -908,9 +912,8 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, bool __skb_flow_dissect(const struct net *net, const struct sk_buff *skb, struct flow_dissector *flow_dissector, - void *target_container, - void *data, __be16 proto, int nhoff, int hlen, - unsigned int flags) + void *target_container, const void *data, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct flow_dissector_key_control *key_control; struct flow_dissector_key_basic *key_basic; @@ -1642,7 +1645,7 @@ __u32 skb_get_hash_perturb(const struct sk_buff *skb, } EXPORT_SYMBOL(skb_get_hash_perturb); -u32 __skb_get_poff(const struct sk_buff *skb, void *data, +u32 __skb_get_poff(const struct sk_buff *skb, const void *data, const struct flow_keys_basic *keys, int hlen) { u32 poff = keys->control.thoff; -- cgit v1.2.1 From 6503b9f29a47cdb4ebd6c36d8bbb018418415c2a Mon Sep 17 00:00:00 2001 From: Manu Bretelle Date: Wed, 10 Mar 2021 10:23:05 -0800 Subject: bpf: Add getter and setter for SO_REUSEPORT through bpf_{g,s}etsockopt Augment the current set of options that are accessible via bpf_{g,s}etsockopt to also support SO_REUSEPORT. 
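A hedged sketch of how a BPF program might exercise the new option follows; the sockops program type, the listen-time trigger and the locally defined constants are illustrative assumptions, not part of this patch:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

/* Defined locally because BPF objects typically avoid libc headers;
 * the values below match the common UAPI definitions (assumption). */
#define SOL_SOCKET	1
#define SO_REUSEPORT	15

SEC("sockops")
int set_reuseport(struct bpf_sock_ops *skops)
{
	int one = 1, val = 0;

	/* Hypothetical policy: enable SO_REUSEPORT when a socket starts
	 * listening. Any sockops callback would do as a trigger. */
	if (skops->op == BPF_SOCK_OPS_TCP_LISTEN_CB) {
		bpf_setsockopt(skops, SOL_SOCKET, SO_REUSEPORT,
			       &one, sizeof(one));
		bpf_getsockopt(skops, SOL_SOCKET, SO_REUSEPORT,
			       &val, sizeof(val));
		/* with this patch applied, val reads back as 1 */
	}
	return 1;
}

char _license[] SEC("license") = "GPL";

Attached to a cgroup (for instance via bpftool's sock_ops attach type), such a program would flip sk->sk_reuseport through the helper path added above.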
Signed-off-by: Manu Bretelle Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20210310182305.1910312-1-chantra@fb.com --- net/core/filter.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index b6732000d8a2..10dac9dd5086 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -4729,6 +4729,9 @@ static int _bpf_setsockopt(struct sock *sk, int level, int optname, sk->sk_prot->keepalive(sk, valbool); sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); break; + case SO_REUSEPORT: + sk->sk_reuseport = valbool; + break; default: ret = -EINVAL; } @@ -4898,6 +4901,9 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, case SO_BINDTOIFINDEX: *((int *)optval) = sk->sk_bound_dev_if; break; + case SO_REUSEPORT: + *((int *)optval) = sk->sk_reuseport; + break; default: goto err_clear; } -- cgit v1.2.1 From 8f64860f8b567cc4f8ac854a65cbf6337404c520 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Sun, 14 Mar 2021 15:49:19 +0100 Subject: net: export dev_set_threaded symbol For wireless devices (e.g. the mt76 driver) multiple net_devices belong to the same wireless phy and the napi object is registered in a dummy netdevice related to the wireless phy. Export dev_set_threaded so it can be reused in device drivers enabling threaded NAPI. Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 5a2847a19cf2..6bc20eabd2b0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6766,6 +6766,7 @@ int dev_set_threaded(struct net_device *dev, bool threaded) return err; } +EXPORT_SYMBOL(dev_set_threaded); void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) -- cgit v1.2.1 From ea4fe7e842f6c7f972d795a8efc167c4bb33b62f Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:40 +0100 Subject: net-sysfs: convert xps_cpus_show to bitmap_zalloc Use bitmap_zalloc instead of zalloc_cpumask_var in xps_cpus_show to align with xps_rxqs_show. This will improve maintenance and allow us to factor out the common logic of the two functions. The function should behave the same. Signed-off-by: Antoine Tenart Signed-off-by: David S.
Miller --- net/core/net-sysfs.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 307628fdf380..3a083c0c9dd3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1367,8 +1367,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - cpumask_var_t mask; - unsigned long index; + unsigned long *mask, index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1396,7 +1395,8 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } } - if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) { + mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); + if (!mask) { ret = -ENOMEM; goto err_rtnl_unlock; } @@ -1414,7 +1414,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, for (i = map->len; i--;) { if (map->queues[i] == index) { - cpumask_set_cpu(cpu, mask); + set_bit(cpu, mask); break; } } @@ -1424,8 +1424,8 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, rtnl_unlock(); - len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); - free_cpumask_var(mask); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids); + bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; err_rtnl_unlock: -- cgit v1.2.1 From d9a063d207f0e538b0f5aa8b04a6c14f88906a6d Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:41 +0100 Subject: net-sysfs: store the return of get_netdev_queue_index in an unsigned int In net-sysfs, get_netdev_queue_index returns an unsigned int. Some of its callers use an unsigned long to store the returned value. Update the code to be consistent, this should only be cosmetic. Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 3a083c0c9dd3..5dc4223f6b68 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1367,7 +1367,8 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, int cpu, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1437,7 +1438,7 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue, const char *buf, size_t len) { struct net_device *dev = queue->dev; - unsigned long index; + unsigned int index; cpumask_var_t mask; int err; @@ -1479,7 +1480,8 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; index = get_netdev_queue_index(queue); @@ -1541,7 +1543,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, { struct net_device *dev = queue->dev; struct net *net = dev_net(dev); - unsigned long *mask, index; + unsigned long *mask; + unsigned int index; int err; if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) -- cgit v1.2.1 From 73f5e52b15e3aa4ef641264228cd9069b1948149 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:42 +0100 Subject: net-sysfs: make xps_cpus_show and xps_rxqs_show consistent Make the implementations of xps_cpus_show and xps_rxqs_show converge, as the two share the same logic but diverged over time. This should not modify their behaviour but will help future changes and improve maintenance. Signed-off-by: Antoine Tenart Signed-off-by: David S.
Miller --- net/core/net-sysfs.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5dc4223f6b68..5f76183ad5bc 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1364,7 +1364,7 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { - int cpu, len, ret, num_tc = 1, tc = 0; + int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; unsigned long *mask; @@ -1404,23 +1404,26 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_cpus_map); - if (dev_maps) { - for_each_possible_cpu(cpu) { - int i, tci = cpu * num_tc + tc; - struct xps_map *map; - - map = rcu_dereference(dev_maps->attr_map[tci]); - if (!map) - continue; - - for (i = map->len; i--;) { - if (map->queues[i] == index) { - set_bit(cpu, mask); - break; - } + if (!dev_maps) + goto out_no_maps; + + for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), + j < nr_cpu_ids;) { + int i, tci = j * num_tc + tc; + struct xps_map *map; + + map = rcu_dereference(dev_maps->attr_map[tci]); + if (!map) + continue; + + for (i = map->len; i--;) { + if (map->queues[i] == index) { + set_bit(j, mask); + break; } } } +out_no_maps: rcu_read_unlock(); rtnl_unlock(); -- cgit v1.2.1 From 255c04a87f4381849fce9ed81e5efabf78a71a30 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:43 +0100 Subject: net: embed num_tc in the xps maps The xps cpus/rxqs map is accessed using dev->num_tc, which is used when allocating the map. But later updates of dev->num_tc can lead to having a mismatch between the maps and how they're accessed. In such cases the map values do not make any sense and out of bound accesses can occur (that can be easily seen using KASAN). This patch aims at fixing this by embedding num_tc into the maps, using the value at the time the map is created. This brings two improvements: - The maps can be accessed using the embedded num_tc, so we know for sure we won't have out of bound accesses. - Checks can be made before accessing the maps so we know the values retrieved will make sense. We also update __netif_set_xps_queue to conditionally copy old maps from dev_maps in the new one only if the number of traffic classes from both maps match. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/dev.c | 63 ++++++++++++++++++++++++++++++++++------------------ net/core/net-sysfs.c | 45 ++++++++++++++----------------------- 2 files changed, 58 insertions(+), 50 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 6bc20eabd2b0..4e29d1994fdd 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2491,7 +2491,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, struct xps_dev_maps *dev_maps, int cpu, u16 offset, u16 count) { - int num_tc = dev->num_tc ? 
: 1; + int num_tc = dev_maps->num_tc; bool active = false; int tci; @@ -2634,10 +2634,10 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, { const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; struct xps_map *map, *new_map; - bool active = false; unsigned int nr_ids; if (dev->num_tc) { @@ -2672,19 +2672,29 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (maps_sz < L1_CACHE_BYTES) maps_sz = L1_CACHE_BYTES; + /* The old dev_maps could be larger or smaller than the one we're + * setting up now, as dev->num_tc could have been updated in between. We + * could try to be smart, but let's be safe instead and only copy + * foreign traffic classes if the two map sizes match. + */ + if (dev_maps && dev_maps->num_tc == num_tc) + copy = true; + /* allocate memory for queue storage */ for (j = -1; j = netif_attrmask_next_and(j, online_mask, mask, nr_ids), j < nr_ids;) { - if (!new_dev_maps) - new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); if (!new_dev_maps) { - mutex_unlock(&xps_map_mutex); - return -ENOMEM; + new_dev_maps = kzalloc(maps_sz, GFP_KERNEL); + if (!new_dev_maps) { + mutex_unlock(&xps_map_mutex); + return -ENOMEM; + } + + new_dev_maps->num_tc = num_tc; } tci = j * num_tc + tc; - map = dev_maps ? xmap_dereference(dev_maps->attr_map[tci]) : - NULL; + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; map = expand_xps_map(map, j, index, is_rxqs_map); if (!map) @@ -2706,7 +2716,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; dev_maps && i--; tci++) { + for (i = tc, tci = j * num_tc; copy && i--; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); @@ -2736,14 +2746,14 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, numa_node_id = -1; } #endif - } else if (dev_maps) { + } else if (copy) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; dev_maps && --i; tci++) { + for (i = num_tc - tc, tci++; copy && --i; tci++) { /* fill in the new device map from the old device map */ map = xmap_dereference(dev_maps->attr_map[tci]); RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); @@ -2761,11 +2771,18 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), j < nr_ids;) { - for (i = num_tc, tci = j * num_tc; i--; tci++) { - new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); - if (map && map != new_map) - kfree_rcu(map, rcu); + if (!map) + continue; + + if (copy) { + new_map = xmap_dereference(new_dev_maps->attr_map[tci]); + if (map == new_map) + continue; + } + + kfree_rcu(map, rcu); } } @@ -2789,12 +2806,12 @@ out_no_new_maps: /* removes tx-queue from unused CPUs/rx-queues */ for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), 
j < nr_ids;) { - for (i = tc, tci = j * num_tc; i--; tci++) + for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); if (!netif_attr_test_mask(j, mask, nr_ids) || !netif_attr_test_online(j, online_mask, nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); - for (i = num_tc - tc, tci++; --i; tci++) + for (i = dev_maps->num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); } @@ -2812,7 +2829,7 @@ error: j < nr_ids;) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); - map = dev_maps ? + map = copy ? xmap_dereference(dev_maps->attr_map[tci]) : NULL; if (new_map && new_map != map) @@ -3944,13 +3961,15 @@ sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev) static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_dev_maps *dev_maps, unsigned int tci) { + int tc = netdev_get_prio_tc_map(dev, skb->priority); struct xps_map *map; int queue_index = -1; - if (dev->num_tc) { - tci *= dev->num_tc; - tci += netdev_get_prio_tc_map(dev, skb->priority); - } + if (tc >= dev_maps->num_tc) + return queue_index; + + tci *= dev_maps->num_tc; + tci += tc; map = rcu_dereference(dev_maps->attr_map[tci]); if (map) { diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 5f76183ad5bc..1364d0f39cb0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1364,9 +1364,9 @@ static const struct attribute_group dql_group = { static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) { - int j, len, ret, num_tc = 1, tc = 0; struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + int j, len, ret, tc = 0; unsigned long *mask; unsigned int index; @@ -1378,22 +1378,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, if (!rtnl_trylock()) return restart_syscall(); - if (dev->num_tc) { - /* Do not allow XPS on subordinate device directly */ - num_tc = dev->num_tc; - if (num_tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } - - /* If queue belongs to subordinate dev use its map */ - dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + ret = -EINVAL; + goto err_rtnl_unlock; } + mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); if (!mask) { ret = -ENOMEM; @@ -1507,12 +1496,12 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_rxqs_map); - if (!dev_maps) + if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), j < dev->num_rx_queues;) { - int i, tci = j * num_tc + tc; + int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; map = rcu_dereference(dev_maps->attr_map[tci]); -- cgit v1.2.1 From 5478fcd0f48322e04ae6c173ad3a1959e066dc83 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:44 +0100 Subject: net: embed nr_ids in the xps maps Embed nr_ids (the number of CPUs for the xps cpus map, and the number of rxqs for the xps rxqs map) in dev_maps. That will help avoid accessing out-of-bounds memory if those values change after dev_maps was allocated. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S.
Miller --- net/core/dev.c | 45 +++++++++++++++++++++------------------------ net/core/net-sysfs.c | 38 ++++++++++++++++++++++---------------- 2 files changed, 43 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4e29d1994fdd..7530c95970a0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2524,14 +2524,14 @@ static void reset_xps_maps(struct net_device *dev, } static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, - struct xps_dev_maps *dev_maps, unsigned int nr_ids, - u16 offset, u16 count, bool is_rxqs_map) + struct xps_dev_maps *dev_maps, u16 offset, u16 count, + bool is_rxqs_map) { + unsigned int nr_ids = dev_maps->nr_ids; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), - j < nr_ids;) + for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) @@ -2551,7 +2551,6 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, { const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; - unsigned int nr_ids; if (!static_key_false(&xps_needed)) return; @@ -2561,11 +2560,9 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (static_key_false(&xps_rxqs_needed)) { dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) { - nr_ids = dev->num_rx_queues; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, - offset, count, true); - } + if (dev_maps) + clean_xps_maps(dev, possible_mask, dev_maps, offset, + count, true); } dev_maps = xmap_dereference(dev->xps_cpus_map); @@ -2574,9 +2571,7 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (num_possible_cpus() > 1) possible_mask = cpumask_bits(cpu_possible_mask); - nr_ids = nr_cpu_ids; - clean_xps_maps(dev, possible_mask, dev_maps, nr_ids, offset, count, - false); + clean_xps_maps(dev, possible_mask, dev_maps, offset, count, false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2673,11 +2668,12 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, maps_sz = L1_CACHE_BYTES; /* The old dev_maps could be larger or smaller than the one we're - * setting up now, as dev->num_tc could have been updated in between. We - * could try to be smart, but let's be safe instead and only copy - * foreign traffic classes if the two map sizes match. + * setting up now, as dev->num_tc or nr_ids could have been updated in + * between. We could try to be smart, but let's be safe instead and only + * copy foreign traffic classes if the two map sizes match. 
*/ - if (dev_maps && dev_maps->num_tc == num_tc) + if (dev_maps && + dev_maps->num_tc == num_tc && dev_maps->nr_ids == nr_ids) copy = true; /* allocate memory for queue storage */ @@ -2690,6 +2686,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, return -ENOMEM; } + new_dev_maps->nr_ids = nr_ids; new_dev_maps->num_tc = num_tc; } @@ -2770,7 +2767,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, goto out_no_old_maps; for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + j < dev_maps->nr_ids;) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) @@ -2804,12 +2801,12 @@ out_no_new_maps: goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = -1; j = netif_attrmask_next(j, possible_mask, dev_maps->nr_ids), + j < dev_maps->nr_ids;) { for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, nr_ids) || - !netif_attr_test_online(j, online_mask, nr_ids)) + if (!netif_attr_test_mask(j, mask, dev_maps->nr_ids) || + !netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) active |= remove_xps_queue(dev_maps, tci, index); for (i = dev_maps->num_tc - tc, tci++; --i; tci++) active |= remove_xps_queue(dev_maps, tci, index); @@ -3965,7 +3962,7 @@ static int __get_xps_queue_idx(struct net_device *dev, struct sk_buff *skb, struct xps_map *map; int queue_index = -1; - if (tc >= dev_maps->num_tc) + if (tc >= dev_maps->num_tc || tci >= dev_maps->nr_ids) return queue_index; tci *= dev_maps->num_tc; @@ -4004,7 +4001,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (dev_maps) { int tci = sk_rx_queue_get(sk); - if (tci >= 0 && tci < dev->num_rx_queues) + if (tci >= 0) queue_index = __get_xps_queue_idx(dev, skb, dev_maps, tci); } diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 1364d0f39cb0..bb08bdc88fa9 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1366,9 +1366,9 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, { struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + unsigned int index, nr_ids; int j, len, ret, tc = 0; unsigned long *mask; - unsigned int index; if (!netif_is_multiqueue(dev)) return -ENOENT; @@ -1387,19 +1387,20 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, goto err_rtnl_unlock; } - mask = bitmap_zalloc(nr_cpu_ids, GFP_KERNEL); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_cpus_map); + nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids; + + mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { ret = -ENOMEM; - goto err_rtnl_unlock; + goto err_rcu_unlock; } - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_cpu_ids), - j < nr_cpu_ids;) { + for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1419,10 +1420,12 @@ out_no_maps: rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, nr_cpu_ids); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? 
len : -EINVAL; +err_rcu_unlock: + rcu_read_unlock(); err_rtnl_unlock: rtnl_unlock(); return ret; @@ -1473,9 +1476,9 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; + unsigned int index, nr_ids; int j, len, ret, tc = 0; unsigned long *mask; - unsigned int index; index = get_netdev_queue_index(queue); @@ -1488,19 +1491,20 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) goto err_rtnl_unlock; } - mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL); + rcu_read_lock(); + dev_maps = rcu_dereference(dev->xps_rxqs_map); + nr_ids = dev_maps ? dev_maps->nr_ids : dev->num_rx_queues; + + mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { ret = -ENOMEM; - goto err_rtnl_unlock; + goto err_rcu_unlock; } - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues), - j < dev->num_rx_queues;) { + for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1520,11 +1524,13 @@ out_no_maps: rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues); + len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); return len < PAGE_SIZE ? len : -EINVAL; +err_rcu_unlock: + rcu_read_unlock(); err_rtnl_unlock: rtnl_unlock(); return ret; -- cgit v1.2.1 From 6f36158e058409ec5ceb4290541e77ae2648fc86 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:45 +0100 Subject: net: remove the xps possible_mask Remove the xps possible_mask. It was an optimization but we can just loop from 0 to nr_ids now that it is embedded in the xps dev_maps. That simplifies the code a bit. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/dev.c | 40 +++++++++++++--------------------------- net/core/net-sysfs.c | 4 ++-- 2 files changed, 15 insertions(+), 29 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 7530c95970a0..3ed8cb3a4061 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2523,33 +2523,28 @@ static void reset_xps_maps(struct net_device *dev, kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, const unsigned long *mask, +static void clean_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, u16 offset, u16 count, bool is_rxqs_map) { - unsigned int nr_ids = dev_maps->nr_ids; bool active = false; int i, j; - for (j = -1; j = netif_attrmask_next(j, mask, nr_ids), j < nr_ids;) - active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, - count); + for (j = 0; j < dev_maps->nr_ids; j++) + active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) reset_xps_maps(dev, dev_maps, is_rxqs_map); if (!is_rxqs_map) { - for (i = offset + (count - 1); count--; i--) { + for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( - netdev_get_tx_queue(dev, i), - NUMA_NO_NODE); - } + netdev_get_tx_queue(dev, i), NUMA_NO_NODE); } } static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - const unsigned long *possible_mask = NULL; struct xps_dev_maps *dev_maps; if (!static_key_false(&xps_needed)) @@ -2561,17 +2556,14 @@ static void netif_reset_xps_queues(struct net_device *dev, u16 offset, if (static_key_false(&xps_rxqs_needed)) { dev_maps = xmap_dereference(dev->xps_rxqs_map); if (dev_maps) - clean_xps_maps(dev, possible_mask, dev_maps, offset, - count, true); + clean_xps_maps(dev, dev_maps, offset, count, true); } dev_maps = xmap_dereference(dev->xps_cpus_map); if (!dev_maps) goto out_no_maps; - if (num_possible_cpus() > 1) - possible_mask = cpumask_bits(cpu_possible_mask); - clean_xps_maps(dev, possible_mask, dev_maps, offset, count, false); + clean_xps_maps(dev, dev_maps, offset, count, false); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2627,8 +2619,8 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, bool is_rxqs_map) { - const unsigned long *online_mask = NULL, *possible_mask = NULL; struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; int maps_sz, num_tc = 1, tc = 0; @@ -2656,10 +2648,8 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); - if (num_possible_cpus() > 1) { + if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - possible_mask = cpumask_bits(cpu_possible_mask); - } dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } @@ -2710,8 +2700,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { /* copy maps belonging to foreign traffic classes */ for (i = tc, tci = j * num_tc; copy && i--; tci++) { /* fill in the new device map from the old device map */ @@ -2766,8 +2755,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) goto out_no_old_maps; - for (j = -1; 
j = netif_attrmask_next(j, possible_mask, nr_ids), - j < dev_maps->nr_ids;) { + for (j = 0; j < dev_maps->nr_ids; j++) { for (i = num_tc, tci = j * dev_maps->num_tc; i--; tci++) { map = xmap_dereference(dev_maps->attr_map[tci]); if (!map) @@ -2801,8 +2789,7 @@ out_no_new_maps: goto out_no_maps; /* removes tx-queue from unused CPUs/rx-queues */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, dev_maps->nr_ids), - j < dev_maps->nr_ids;) { + for (j = 0; j < dev_maps->nr_ids; j++) { for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) active |= remove_xps_queue(dev_maps, tci, index); if (!netif_attr_test_mask(j, mask, dev_maps->nr_ids) || @@ -2822,8 +2809,7 @@ out_no_maps: return 0; error: /* remove any maps that we added */ - for (j = -1; j = netif_attrmask_next(j, possible_mask, nr_ids), - j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { for (i = num_tc, tci = j * num_tc; i--; tci++) { new_map = xmap_dereference(new_dev_maps->attr_map[tci]); map = copy ? diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index bb08bdc88fa9..c762c435ff76 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1400,7 +1400,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; @@ -1504,7 +1504,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) if (!dev_maps || tc >= dev_maps->num_tc) goto out_no_maps; - for (j = -1; j = netif_attrmask_next(j, NULL, nr_ids), j < nr_ids;) { + for (j = 0; j < nr_ids; j++) { int i, tci = j * dev_maps->num_tc + tc; struct xps_map *map; -- cgit v1.2.1 From 044ab86d431b59b88966457dbb62679f274ec442 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:46 +0100 Subject: net: move the xps maps to an array Move the xps maps (xps_cpus_map and xps_rxqs_map) to an array in net_device. That will simplify a lot the code removing the need for lots of if/else conditionals as the correct map will be available using its offset in the array. This should not modify the xps maps behaviour in any way. Suggested-by: Alexander Duyck Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/dev.c | 73 ++++++++++++++++++++++------------------------------ net/core/net-sysfs.c | 6 ++--- 2 files changed, 34 insertions(+), 45 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 3ed8cb3a4061..af57e32bb543 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2511,31 +2511,34 @@ static bool remove_xps_queue_cpu(struct net_device *dev, static void reset_xps_maps(struct net_device *dev, struct xps_dev_maps *dev_maps, - bool is_rxqs_map) + enum xps_map_type type) { - if (is_rxqs_map) { - static_key_slow_dec_cpuslocked(&xps_rxqs_needed); - RCU_INIT_POINTER(dev->xps_rxqs_map, NULL); - } else { - RCU_INIT_POINTER(dev->xps_cpus_map, NULL); - } static_key_slow_dec_cpuslocked(&xps_needed); + if (type == XPS_RXQS) + static_key_slow_dec_cpuslocked(&xps_rxqs_needed); + + RCU_INIT_POINTER(dev->xps_maps[type], NULL); + kfree_rcu(dev_maps, rcu); } -static void clean_xps_maps(struct net_device *dev, - struct xps_dev_maps *dev_maps, u16 offset, u16 count, - bool is_rxqs_map) +static void clean_xps_maps(struct net_device *dev, enum xps_map_type type, + u16 offset, u16 count) { + struct xps_dev_maps *dev_maps; bool active = false; int i, j; + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (!dev_maps) + return; + for (j = 0; j < dev_maps->nr_ids; j++) active |= remove_xps_queue_cpu(dev, dev_maps, j, offset, count); if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); - if (!is_rxqs_map) { + if (type == XPS_CPUS) { for (i = offset + (count - 1); count--; i--) netdev_queue_numa_node_write( netdev_get_tx_queue(dev, i), NUMA_NO_NODE); @@ -2545,27 +2548,17 @@ static void clean_xps_maps(struct net_device *dev, static void netif_reset_xps_queues(struct net_device *dev, u16 offset, u16 count) { - struct xps_dev_maps *dev_maps; - if (!static_key_false(&xps_needed)) return; cpus_read_lock(); mutex_lock(&xps_map_mutex); - if (static_key_false(&xps_rxqs_needed)) { - dev_maps = xmap_dereference(dev->xps_rxqs_map); - if (dev_maps) - clean_xps_maps(dev, dev_maps, offset, count, true); - } - - dev_maps = xmap_dereference(dev->xps_cpus_map); - if (!dev_maps) - goto out_no_maps; + if (static_key_false(&xps_rxqs_needed)) + clean_xps_maps(dev, XPS_RXQS, offset, count); - clean_xps_maps(dev, dev_maps, offset, count, false); + clean_xps_maps(dev, XPS_CPUS, offset, count); -out_no_maps: mutex_unlock(&xps_map_mutex); cpus_read_unlock(); } @@ -2617,7 +2610,7 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, - u16 index, bool is_rxqs_map) + u16 index, enum xps_map_type type) { struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; const unsigned long *online_mask = NULL; @@ -2642,15 +2635,15 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } mutex_lock(&xps_map_mutex); - if (is_rxqs_map) { + + dev_maps = xmap_dereference(dev->xps_maps[type]); + if (type == XPS_RXQS) { maps_sz = XPS_RXQ_DEV_MAPS_SIZE(num_tc, dev->num_rx_queues); - dev_maps = xmap_dereference(dev->xps_rxqs_map); nr_ids = dev->num_rx_queues; } else { maps_sz = XPS_CPU_DEV_MAPS_SIZE(num_tc); if (num_possible_cpus() > 1) online_mask = cpumask_bits(cpu_online_mask); - dev_maps = xmap_dereference(dev->xps_cpus_map); nr_ids = nr_cpu_ids; } @@ -2683,7 +2676,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, tci = j * num_tc + tc; map = copy ? 
xmap_dereference(dev_maps->attr_map[tci]) : NULL; - map = expand_xps_map(map, j, index, is_rxqs_map); + map = expand_xps_map(map, j, index, type == XPS_RXQS); if (!map) goto error; @@ -2696,7 +2689,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (!dev_maps) { /* Increment static keys at most once per type */ static_key_slow_inc_cpuslocked(&xps_needed); - if (is_rxqs_map) + if (type == XPS_RXQS) static_key_slow_inc_cpuslocked(&xps_rxqs_needed); } @@ -2725,7 +2718,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, if (pos == map->len) map->queues[map->len++] = index; #ifdef CONFIG_NUMA - if (!is_rxqs_map) { + if (type == XPS_CPUS) { if (numa_node_id == -2) numa_node_id = cpu_to_node(j); else if (numa_node_id != cpu_to_node(j)) @@ -2746,10 +2739,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } } - if (is_rxqs_map) - rcu_assign_pointer(dev->xps_rxqs_map, new_dev_maps); - else - rcu_assign_pointer(dev->xps_cpus_map, new_dev_maps); + rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); /* Cleanup old maps */ if (!dev_maps) @@ -2778,12 +2768,11 @@ out_no_old_maps: active = true; out_no_new_maps: - if (!is_rxqs_map) { + if (type == XPS_CPUS) /* update Tx queue numa node */ netdev_queue_numa_node_write(netdev_get_tx_queue(dev, index), (numa_node_id >= 0) ? numa_node_id : NUMA_NO_NODE); - } if (!dev_maps) goto out_no_maps; @@ -2801,7 +2790,7 @@ out_no_new_maps: /* free map if not active */ if (!active) - reset_xps_maps(dev, dev_maps, is_rxqs_map); + reset_xps_maps(dev, dev_maps, type); out_no_maps: mutex_unlock(&xps_map_mutex); @@ -2833,7 +2822,7 @@ int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask, int ret; cpus_read_lock(); - ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, false); + ret = __netif_set_xps_queue(dev, cpumask_bits(mask), index, XPS_CPUS); cpus_read_unlock(); return ret; @@ -3983,7 +3972,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, if (!static_key_false(&xps_rxqs_needed)) goto get_cpus_map; - dev_maps = rcu_dereference(sb_dev->xps_rxqs_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_RXQS]); if (dev_maps) { int tci = sk_rx_queue_get(sk); @@ -3994,7 +3983,7 @@ static int get_xps_queue(struct net_device *dev, struct net_device *sb_dev, get_cpus_map: if (queue_index < 0) { - dev_maps = rcu_dereference(sb_dev->xps_cpus_map); + dev_maps = rcu_dereference(sb_dev->xps_maps[XPS_CPUS]); if (dev_maps) { unsigned int tci = skb->sender_cpu - 1; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index c762c435ff76..ca1f3b63cfad 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1388,7 +1388,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_cpus_map); + dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids; mask = bitmap_zalloc(nr_ids, GFP_KERNEL); @@ -1492,7 +1492,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) } rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_rxqs_map); + dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); nr_ids = dev_maps ? 
dev_maps->nr_ids : dev->num_rx_queues; mask = bitmap_zalloc(nr_ids, GFP_KERNEL); @@ -1566,7 +1566,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, } cpus_read_lock(); - err = __netif_set_xps_queue(dev, mask, index, true); + err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS); cpus_read_unlock(); rtnl_unlock(); -- cgit v1.2.1 From 402fbb992e13fc57e917ac7c0a07a8a3e2385858 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:47 +0100 Subject: net: add a helper to copy xps maps to the new dev_maps This patch adds a helper, xps_copy_dev_maps, to copy maps from dev_maps to new_dev_maps at a given index. The logic stays the same, with improved code readability and maintainability. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/dev.c | 45 +++++++++++++++++++++++-------------------- 1 file changed, 25 insertions(+), 20 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index af57e32bb543..00f6b41e11d8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2608,6 +2608,25 @@ static struct xps_map *expand_xps_map(struct xps_map *map, int attr_index, return new_map; } +/* Copy xps maps at a given index */ +static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, + struct xps_dev_maps *new_dev_maps, int index, + int tc, bool skip_tc) +{ + int i, tci = index * dev_maps->num_tc; + struct xps_map *map; + + /* copy maps belonging to foreign traffic classes */ + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && skip_tc) + continue; + + /* fill in the new device map from the old device map */ + map = xmap_dereference(dev_maps->attr_map[tci]); + RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); + } +} + /* Must be called under cpus_read_lock */ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) @@ -2694,23 +2713,16 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } for (j = 0; j < nr_ids; j++) { - /* copy maps belonging to foreign traffic classes */ - for (i = tc, tci = j * num_tc; copy && i--; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + bool skip_tc = false; - /* We need to explicitly update tci as prevous loop - * could break out early if dev_maps is NULL.
- */ tci = j * num_tc + tc; - if (netif_attr_test_mask(j, mask, nr_ids) && netif_attr_test_online(j, online_mask, nr_ids)) { /* add tx-queue to CPU/rx-queue maps */ int pos = 0; + skip_tc = true; + map = xmap_dereference(new_dev_maps->attr_map[tci]); while ((pos < map->len) && (map->queues[pos] != index)) pos++; @@ -2725,18 +2737,11 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, numa_node_id = -1; } #endif - } else if (copy) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); } - /* copy maps belonging to foreign traffic classes */ - for (i = num_tc - tc, tci++; copy && --i; tci++) { - /* fill in the new device map from the old device map */ - map = xmap_dereference(dev_maps->attr_map[tci]); - RCU_INIT_POINTER(new_dev_maps->attr_map[tci], map); - } + if (copy) + xps_copy_dev_maps(dev_maps, new_dev_maps, j, tc, + skip_tc); } rcu_assign_pointer(dev->xps_maps[type], new_dev_maps); -- cgit v1.2.1 From 132f743b01b85b8fae7e1f298bfd81a66b9389a8 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:48 +0100 Subject: net: improve queue removal readability in __netif_set_xps_queue Improve the readability of the loop removing tx-queue from unused CPUs/rx-queues in __netif_set_xps_queue. The change should only be cosmetic. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/dev.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 00f6b41e11d8..c8ce2dfcc97d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2784,13 +2784,16 @@ out_no_new_maps: /* removes tx-queue from unused CPUs/rx-queues */ for (j = 0; j < dev_maps->nr_ids; j++) { - for (i = tc, tci = j * dev_maps->num_tc; i--; tci++) - active |= remove_xps_queue(dev_maps, tci, index); - if (!netif_attr_test_mask(j, mask, dev_maps->nr_ids) || - !netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) - active |= remove_xps_queue(dev_maps, tci, index); - for (i = dev_maps->num_tc - tc, tci++; --i; tci++) + tci = j * dev_maps->num_tc; + + for (i = 0; i < dev_maps->num_tc; i++, tci++) { + if (i == tc && + netif_attr_test_mask(j, mask, dev_maps->nr_ids) && + netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) + continue; + active |= remove_xps_queue(dev_maps, tci, index); + } } /* free map if not active */ -- cgit v1.2.1 From d7be87a687cc261d663dcf97c01056f71398f9f9 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:49 +0100 Subject: net-sysfs: move the rtnl unlock up in the xps show helpers Now that nr_ids and num_tc are stored in the xps dev_maps, which are RCU protected, we do not have the need to protect the maps in the rtnl lock. Move the rtnl unlock up so we reduce the rtnl locking section. We also increase the reference count on the subordinate device if any, as we don't want this device to be freed while we use it (now that the rtnl lock isn't protecting it in the whole function). Signed-off-by: Antoine Tenart Signed-off-by: David S. 
Miller --- net/core/net-sysfs.c | 25 +++++++++++-------------- 1 file changed, 11 insertions(+), 14 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index ca1f3b63cfad..094fea082649 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1383,10 +1383,14 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, tc = netdev_txq_to_tc(dev, index); if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; + rtnl_unlock(); + return -EINVAL; } + /* Make sure the subordinate device can't be freed */ + get_device(&dev->dev); + rtnl_unlock(); + rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids; @@ -1417,8 +1421,7 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } out_no_maps: rcu_read_unlock(); - - rtnl_unlock(); + put_device(&dev->dev); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); @@ -1426,8 +1429,7 @@ out_no_maps: err_rcu_unlock: rcu_read_unlock(); -err_rtnl_unlock: - rtnl_unlock(); + put_device(&dev->dev); return ret; } @@ -1486,10 +1488,9 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) return restart_syscall(); tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - ret = -EINVAL; - goto err_rtnl_unlock; - } + rtnl_unlock(); + if (tc < 0) + return -EINVAL; rcu_read_lock(); dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); @@ -1522,8 +1523,6 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) out_no_maps: rcu_read_unlock(); - rtnl_unlock(); - len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); @@ -1531,8 +1530,6 @@ out_no_maps: err_rcu_unlock: rcu_read_unlock(); -err_rtnl_unlock: - rtnl_unlock(); return ret; } -- cgit v1.2.1 From 2db6cdaebac83c13acb165594b09282fa03cec89 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:50 +0100 Subject: net-sysfs: move the xps cpus/rxqs retrieval in a common function Most of the xps_cpus_show and xps_rxqs_show functions share the same logic. Having it in two different functions does not help maintenance. This patch moves their common logic into a new function, xps_queue_show, to improve this. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 125 ++++++++++++++++++++------------------------------- 1 file changed, 48 insertions(+), 77 deletions(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 094fea082649..562a42fcd437 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1361,44 +1361,27 @@ static const struct attribute_group dql_group = { #endif /* CONFIG_BQL */ #ifdef CONFIG_XPS -static ssize_t xps_cpus_show(struct netdev_queue *queue, - char *buf) +static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, + int tc, char *buf, enum xps_map_type type) { - struct net_device *dev = queue->dev; struct xps_dev_maps *dev_maps; - unsigned int index, nr_ids; - int j, len, ret, tc = 0; unsigned long *mask; - - if (!netif_is_multiqueue(dev)) - return -ENOENT; - - index = get_netdev_queue_index(queue); - - if (!rtnl_trylock()) - return restart_syscall(); - - /* If queue belongs to subordinate dev use its map */ - dev = netdev_get_tx_queue(dev, index)->sb_dev ? 
: dev; - - tc = netdev_txq_to_tc(dev, index); - if (tc < 0) { - rtnl_unlock(); - return -EINVAL; - } - - /* Make sure the subordinate device can't be freed */ - get_device(&dev->dev); - rtnl_unlock(); + unsigned int nr_ids; + int j, len; rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps[XPS_CPUS]); - nr_ids = dev_maps ? dev_maps->nr_ids : nr_cpu_ids; + dev_maps = rcu_dereference(dev->xps_maps[type]); + + /* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0 + * when dev_maps hasn't been allocated yet, to be backward compatible. + */ + nr_ids = dev_maps ? dev_maps->nr_ids : + (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); mask = bitmap_zalloc(nr_ids, GFP_KERNEL); if (!mask) { - ret = -ENOMEM; - goto err_rcu_unlock; + rcu_read_unlock(); + return -ENOMEM; } if (!dev_maps || tc >= dev_maps->num_tc) @@ -1421,16 +1404,44 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue, } out_no_maps: rcu_read_unlock(); - put_device(&dev->dev); len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); bitmap_free(mask); + return len < PAGE_SIZE ? len : -EINVAL; +} + +static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf) +{ + struct net_device *dev = queue->dev; + unsigned int index; + int len, tc; + + if (!netif_is_multiqueue(dev)) + return -ENOENT; + + index = get_netdev_queue_index(queue); + + if (!rtnl_trylock()) + return restart_syscall(); + + /* If queue belongs to subordinate dev use its map */ + dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev; + + tc = netdev_txq_to_tc(dev, index); + if (tc < 0) { + rtnl_unlock(); + return -EINVAL; + } + + /* Make sure the subordinate device can't be freed */ + get_device(&dev->dev); + rtnl_unlock(); + + len = xps_queue_show(dev, index, tc, buf, XPS_CPUS); -err_rcu_unlock: - rcu_read_unlock(); put_device(&dev->dev); - return ret; + return len; } static ssize_t xps_cpus_store(struct netdev_queue *queue, @@ -1477,10 +1488,8 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) { struct net_device *dev = queue->dev; - struct xps_dev_maps *dev_maps; - unsigned int index, nr_ids; - int j, len, ret, tc = 0; - unsigned long *mask; + unsigned int index; + int tc; index = get_netdev_queue_index(queue); @@ -1492,45 +1501,7 @@ static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf) if (tc < 0) return -EINVAL; - rcu_read_lock(); - dev_maps = rcu_dereference(dev->xps_maps[XPS_RXQS]); - nr_ids = dev_maps ? dev_maps->nr_ids : dev->num_rx_queues; - - mask = bitmap_zalloc(nr_ids, GFP_KERNEL); - if (!mask) { - ret = -ENOMEM; - goto err_rcu_unlock; - } - - if (!dev_maps || tc >= dev_maps->num_tc) - goto out_no_maps; - - for (j = 0; j < nr_ids; j++) { - int i, tci = j * dev_maps->num_tc + tc; - struct xps_map *map; - - map = rcu_dereference(dev_maps->attr_map[tci]); - if (!map) - continue; - - for (i = map->len; i--;) { - if (map->queues[i] == index) { - set_bit(j, mask); - break; - } - } - } -out_no_maps: - rcu_read_unlock(); - - len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids); - bitmap_free(mask); - - return len < PAGE_SIZE ? 
len : -EINVAL; - -err_rcu_unlock: - rcu_read_unlock(); - return ret; + return xps_queue_show(dev, index, tc, buf, XPS_RXQS); } static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf, -- cgit v1.2.1 From 2d05bf015308275f7c67a780f70026077285cfc0 Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:51 +0100 Subject: net: fix use after free in xps When setting up a new dev_maps in __netif_set_xps_queue, we remove and free maps from unused CPUs/rx-queues near the end of the function by calling remove_xps_queue. However, it's possible those maps are also part of the old, not-freed-yet dev_maps, which might be used concurrently. When that happens, a map can be freed while its corresponding entry in the old dev_maps table isn't NULLed, leading to "BUG: KASAN: use-after-free" in different places. This fixes the map freeing logic for unused CPUs/rx-queues to also NULL the map entries in the old dev_maps table. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/dev.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c8ce2dfcc97d..d5f6ba209f1e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2460,7 +2460,7 @@ static DEFINE_MUTEX(xps_map_mutex); rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) static bool remove_xps_queue(struct xps_dev_maps *dev_maps, - int tci, u16 index) + struct xps_dev_maps *old_maps, int tci, u16 index) { struct xps_map *map = NULL; int pos; @@ -2479,6 +2479,8 @@ static bool remove_xps_queue(struct xps_dev_maps *dev_maps, break; } + if (old_maps) + RCU_INIT_POINTER(old_maps->attr_map[tci], NULL); RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); return false; @@ -2499,7 +2501,7 @@ static bool remove_xps_queue_cpu(struct net_device *dev, int i, j; for (i = count, j = offset; i--; j++) { - if (!remove_xps_queue(dev_maps, tci, j)) + if (!remove_xps_queue(dev_maps, NULL, tci, j)) break; } @@ -2631,7 +2633,7 @@ static void xps_copy_dev_maps(struct xps_dev_maps *dev_maps, int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, u16 index, enum xps_map_type type) { - struct xps_dev_maps *dev_maps, *new_dev_maps = NULL; + struct xps_dev_maps *dev_maps, *new_dev_maps = NULL, *old_dev_maps = NULL; const unsigned long *online_mask = NULL; bool active = false, copy = false; int i, j, tci, numa_node_id = -2; @@ -2766,7 +2768,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, } } - kfree_rcu(dev_maps, rcu); + old_dev_maps = dev_maps; out_no_old_maps: dev_maps = new_dev_maps; @@ -2792,10 +2794,15 @@ out_no_new_maps: netif_attr_test_online(j, online_mask, dev_maps->nr_ids)) continue; - active |= remove_xps_queue(dev_maps, tci, index); + active |= remove_xps_queue(dev_maps, + copy ? old_dev_maps : NULL, + tci, index); } } + if (old_dev_maps) + kfree_rcu(old_dev_maps, rcu); + /* free map if not active */ if (!active) reset_xps_maps(dev, dev_maps, type); -- cgit v1.2.1 From 75b2758abc355c410dd335d45b2d40f920e27cde Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Thu, 18 Mar 2021 19:37:52 +0100 Subject: net: NULL the old xps map entries when freeing them In __netif_set_xps_queue, old map entries from the old dev_maps are freed but their corresponding entries in the old dev_maps aren't NULLed. Fix this. Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller
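To make the hazard these two xps fixes address concrete, here is a minimal sketch of the pattern; the types and the deferred free_rcu() helper are illustrative stand-ins, not kernel API:

struct entry { int queue; };

struct map_table {
	struct entry *slot[16];		/* stands in for dev_maps->attr_map[] */
};

static void free_rcu(struct entry *e)	/* stand-in for kfree_rcu() */
{
	(void)e;			/* real code: defer past a grace period */
}

/* Two generations of the table can be live at once: RCU readers may still
 * walk 'old' while 'cur' replaces it. Freeing an entry through 'cur'
 * without clearing the alias in 'old' leaves those readers a dangling
 * pointer, which is exactly the KASAN use-after-free described above.
 */
static void remove_entry(struct map_table *cur, struct map_table *old, int i)
{
	struct entry *e = cur->slot[i];

	if (!e)
		return;
	if (old)
		old->slot[i] = NULL;	/* the fix: clear the old alias too */
	cur->slot[i] = NULL;
	free_rcu(e);
}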
--- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index d5f6ba209f1e..4961fc2e9b19 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2764,6 +2764,7 @@ int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask, continue; } + RCU_INIT_POINTER(dev_maps->attr_map[tci], NULL); kfree_rcu(map, rcu); } } -- cgit v1.2.1 From a835f9034efbb699f307575bead2607c2fbc93ac Mon Sep 17 00:00:00 2001 From: Xiong Zhenwu Date: Thu, 18 Mar 2021 04:52:13 -0700 Subject: /net/core/: fix misspellings using codespell tool A typo was found by the codespell tool at line 1734 of drop_monitor.c: $ codespell ./net/core/ ./net/core/drop_monitor.c:1734: guarnateed ==> guaranteed Fix a typo found by codespell. Signed-off-by: Xiong Zhenwu Signed-off-by: David S. Miller --- net/core/drop_monitor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 571f191c06d9..1eb02c2236f2 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1731,7 +1731,7 @@ static void exit_net_drop_monitor(void) /* * Because of the module_get/put we do in the trace state change path - * we are guarnateed not to have any current users when we get here + * we are guaranteed not to have any current users when we get here */ for_each_possible_cpu(cpu) { -- cgit v1.2.1 From 919067cc845f323a80b6fe987b64238bd82d309e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 19 Mar 2021 10:39:33 -0700 Subject: net: add CONFIG_PCPU_DEV_REFCNT I was working on a syzbot issue claiming one device could not be dismantled because its refcount was -1: unregister_netdevice: waiting for sit0 to become free. Usage count = -1 It would be nice if syzbot could trigger a warning at the time this reference count became negative. This patch adds a CONFIG_PCPU_DEV_REFCNT option which defaults to per-cpu variables (as before this patch) on SMP builds. v2: free_dev label in alloc_netdev_mqs() is moved to avoid a compiler warning (-Wunused-label), as reported by kernel test robot Signed-off-by: Eric Dumazet Signed-off-by: David S.
Miller --- net/core/dev.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 4961fc2e9b19..be941ed754ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10312,11 +10312,15 @@ EXPORT_SYMBOL(register_netdev); int netdev_refcnt_read(const struct net_device *dev) { +#ifdef CONFIG_PCPU_DEV_REFCNT int i, refcnt = 0; for_each_possible_cpu(i) refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); return refcnt; +#else + return refcount_read(&dev->dev_refcnt); +#endif } EXPORT_SYMBOL(netdev_refcnt_read); @@ -10674,9 +10678,11 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; +#ifdef CONFIG_PCPU_DEV_REFCNT dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; +#endif if (dev_addr_init(dev)) goto free_pcpu; @@ -10740,8 +10746,10 @@ free_all: return NULL; free_pcpu: +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); free_dev: +#endif netdev_freemem(dev); return NULL; } @@ -10783,8 +10791,10 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); +#ifdef CONFIG_PCPU_DEV_REFCNT free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; +#endif free_percpu(dev->xdp_bulkq); dev->xdp_bulkq = NULL; -- cgit v1.2.1 From 5da9ace3405f40d8d93c1b519696f47bc4402318 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 13:30:19 +0200 Subject: net: make xps_needed and xps_rxqs_needed static Since their introduction in commit 04157469b7b8 ("net: Use static_key for XPS maps"), xps_needed and xps_rxqs_needed were never used outside net/core/dev.c, so I don't really understand why they were exported as symbols in the first place. This is needed in order to silence a "make W=1" warning about these static keys not being declared as static variables, but not having a previous declaration in a header file nonetheless. Cc: Amritha Nambiar Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index be941ed754ac..ffab3928eeeb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2451,10 +2451,8 @@ int netdev_txq_to_tc(struct net_device *dev, unsigned int txq) EXPORT_SYMBOL(netdev_txq_to_tc); #ifdef CONFIG_XPS -struct static_key xps_needed __read_mostly; -EXPORT_SYMBOL(xps_needed); -struct static_key xps_rxqs_needed __read_mostly; -EXPORT_SYMBOL(xps_rxqs_needed); +static struct static_key xps_needed __read_mostly; +static struct static_key xps_rxqs_needed __read_mostly; static DEFINE_MUTEX(xps_map_mutex); #define xmap_dereference(P) \ rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex)) -- cgit v1.2.1 From 744b8376632208137fe4acc9967b93e2970732a3 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Mon, 22 Mar 2021 13:31:48 +0200 Subject: net: move the ptype_all and ptype_base declarations to include/linux/netdevice.h ptype_all and ptype_base are declared in net/core/dev.c as non-static, because they are used by net-procfs.c too. However, a "make W=1" build complains that there was no previous declaration of ptype_all and ptype_base in a header file, so this way of declaring things constitutes a violation of coding style. Let's move the extern declarations of ptype_all and ptype_base to the linux/netdevice.h file, which is included by net-procfs.c too. 
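For context, the "make W=1" complaint behind both cleanups above fires when a global is defined with no previous declaration in scope; the cure is either making it static or placing one extern declaration in a header shared by the definer and all users. A minimal sketch with invented file and variable names:

/* counters.h */
#ifndef COUNTERS_H
#define COUNTERS_H
extern int rx_drops;	/* the single authoritative declaration */
#endif

/* counters.c */
#include "counters.h"	/* definer sees its own declaration: no W=1 warning */
int rx_drops;

/* reader.c */
#include "counters.h"	/* users inherit the declaration from the header, */
int read_rx_drops(void)	/* never from a local 'extern' copy */
{
	return rx_drops;
}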
Signed-off-by: Vladimir Oltean Signed-off-by: David S. Miller --- net/core/net-procfs.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net/core') diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index c714e6a9dad4..d8b9dbabd4a4 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -10,9 +10,6 @@ #define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) #define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) -extern struct list_head ptype_all __read_mostly; -extern struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; - static inline struct net_device *dev_from_same_bucket(struct seq_file *seq, loff_t *pos) { struct net *net = seq_file_net(seq); -- cgit v1.2.1 From 7f08ec6e04269ce53b664761c9108b44ed2f54ab Mon Sep 17 00:00:00 2001 From: Antoine Tenart Date: Mon, 22 Mar 2021 16:43:29 +0100 Subject: net-sysfs: remove possible sleep from an RCU read-side critical section xps_queue_show is mostly made of an RCU read-side critical section and calls bitmap_zalloc with GFP_KERNEL in the middle of it. That is not allowed as this call may sleep and such behaviours aren't allowed in RCU read-side critical sections. Fix this by using GFP_NOWAIT instead. Fixes: 5478fcd0f483 ("net: embed nr_ids in the xps maps") Reported-by: kernel test robot Suggested-by: Matthew Wilcox Signed-off-by: Antoine Tenart Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 562a42fcd437..f6197774048b 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -1378,7 +1378,7 @@ static ssize_t xps_queue_show(struct net_device *dev, unsigned int index, nr_ids = dev_maps ? dev_maps->nr_ids : (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues); - mask = bitmap_zalloc(nr_ids, GFP_KERNEL); + mask = bitmap_zalloc(nr_ids, GFP_NOWAIT); if (!mask) { rcu_read_unlock(); return -ENOMEM; -- cgit v1.2.1 From add2d73631070c951b0de81a01d1463a15cfbd47 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 22 Mar 2021 11:21:45 -0700 Subject: net: set initial device refcount to 1 When adding CONFIG_PCPU_DEV_REFCNT, I forgot that the initial net device refcount was 0. When CONFIG_PCPU_DEV_REFCNT is not set, this means the first dev_hold() triggers an illegal refcount operation (addition on 0) refcount_t: addition on 0; use-after-free. WARNING: CPU: 0 PID: 1 at lib/refcount.c:25 refcount_warn_saturate+0x128/0x1a4 Fix is to change initial (and final) refcount to be 1. Also add a missing kerneldoc piece, as reported by Stephen Rothwell. Fixes: 919067cc845f ("net: add CONFIG_PCPU_DEV_REFCNT") Signed-off-by: Eric Dumazet Reported-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: David S. 
Miller --- net/core/dev.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index ffab3928eeeb..c9a496f5e687 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10346,7 +10346,7 @@ static void netdev_wait_allrefs(struct net_device *dev) rebroadcast_time = warning_time = jiffies; refcnt = netdev_refcnt_read(dev); - while (refcnt != 0) { + while (refcnt != 1) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -10383,7 +10383,7 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; @@ -10459,7 +10459,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(netdev_refcnt_read(dev)); + BUG_ON(netdev_refcnt_read(dev) != 1); BUG_ON(!list_empty(&dev->ptype_all)); BUG_ON(!list_empty(&dev->ptype_specific)); WARN_ON(rcu_access_pointer(dev->ip_ptr)); @@ -10680,6 +10680,9 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) goto free_dev; + dev_hold(dev); +#else + refcount_set(&dev->dev_refcnt, 1); #endif if (dev_addr_init(dev)) -- cgit v1.2.1 From 5aa3afe107d9099fc0dea2acf82c3e3c8f0f20e2 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 23 Mar 2021 07:49:23 +0100 Subject: net: make unregister netdev warning timeout configurable netdev_wait_allrefs() issues a warning if refcount does not drop to 0 after 10 seconds. While 10 second wait generally should not happen under normal workload in normal environment, it seems to fire falsely very often during fuzzing and/or in qemu emulation (~10x slower). At least it's not possible to understand if it's really a false positive or not. Automated testing generally bumps all timeouts to very high values to avoid flake failures. Add net.core.netdev_unregister_timeout_secs sysctl to make the timeout configurable for automated testing systems. Lowering the timeout may also be useful for e.g. manual bisection. The default value matches the current behavior. Signed-off-by: Dmitry Vyukov Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=211877 Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Signed-off-by: David S. Miller --- net/core/dev.c | 6 +++++- net/core/sysctl_net_core.c | 10 ++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index c9a496f5e687..515309573cb8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10322,6 +10322,8 @@ int netdev_refcnt_read(const struct net_device *dev) } EXPORT_SYMBOL(netdev_refcnt_read); +int netdev_unregister_timeout_secs __read_mostly = 10; + #define WAIT_REFS_MIN_MSECS 1 #define WAIT_REFS_MAX_MSECS 250 /** @@ -10383,7 +10385,9 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt != 1 && time_after(jiffies, warning_time + 10 * HZ)) { + if (refcnt && + time_after(jiffies, warning_time + + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. 
Usage count = %d\n", dev->name, refcnt); warning_time = jiffies; diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 4567de519603..d84c8a1b280e 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -24,6 +24,7 @@ static int two = 2; static int three = 3; +static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -570,6 +571,15 @@ static struct ctl_table net_core_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ONE, }, + { + .procname = "netdev_unregister_timeout_secs", + .data = &netdev_unregister_timeout_secs, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &int_3600, + }, { } }; -- cgit v1.2.1 From ddb94eafab8b597b05904c8277194ea2d6357fa9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 24 Mar 2021 02:30:32 +0100 Subject: net: resolve forwarding path from virtual netdevice and HW destination address This patch adds dev_fill_forward_path() which resolves the path to reach the real netdevice from the IP forwarding side. This function takes as input the netdevice and the destination hardware address and it walks down the devices calling .ndo_fill_forward_path() for each device until the real device is found. For instance, assuming the following topology: IP forwarding / \ br0 eth0 / \ eth1 eth2 . . . ethX ab:cd:ef:ab:cd:ef where eth1 and eth2 are bridge ports and eth0 provides WAN connectivity. ethX is the interface in another box which is connected to the eth1 bridge port. For packets going through IP forwarding to br0 whose destination MAC address is ab:cd:ef:ab:cd:ef, dev_fill_forward_path() provides the following path: br0 -> eth1 .ndo_fill_forward_path for br0 looks up at the FDB for the bridge port from the destination MAC address to get the bridge port eth1. This information allows to create a fast path that bypasses the classic bridge and IP forwarding paths, so packets go directly from the bridge port eth1 to eth0 (wan interface) and vice versa. fast path .------------------------. / \ | IP forwarding | | / \ \/ | br0 eth0 . / \ -> eth1 eth2 . . . ethX ab:cd:ef:ab:cd:ef Signed-off-by: Pablo Neira Ayuso Signed-off-by: David S. 
Miller --- net/core/dev.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 515309573cb8..4bb6dcdbed8b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -848,6 +848,52 @@ int dev_fill_metadata_dst(struct net_device *dev, struct sk_buff *skb) } EXPORT_SYMBOL_GPL(dev_fill_metadata_dst); +static struct net_device_path *dev_fwd_path(struct net_device_path_stack *stack) +{ + int k = stack->num_paths++; + + if (WARN_ON_ONCE(k >= NET_DEVICE_PATH_STACK_MAX)) + return NULL; + + return &stack->path[k]; +} + +int dev_fill_forward_path(const struct net_device *dev, const u8 *daddr, + struct net_device_path_stack *stack) +{ + const struct net_device *last_dev; + struct net_device_path_ctx ctx = { + .dev = dev, + .daddr = daddr, + }; + struct net_device_path *path; + int ret = 0; + + stack->num_paths = 0; + while (ctx.dev && ctx.dev->netdev_ops->ndo_fill_forward_path) { + last_dev = ctx.dev; + path = dev_fwd_path(stack); + if (!path) + return -1; + + memset(path, 0, sizeof(struct net_device_path)); + ret = ctx.dev->netdev_ops->ndo_fill_forward_path(&ctx, path); + if (ret < 0) + return -1; + + if (WARN_ON_ONCE(last_dev == ctx.dev)) + return -1; + } + path = dev_fwd_path(stack); + if (!path) + return -1; + path->type = DEV_PATH_ETHERNET; + path->dev = ctx.dev; + + return ret; +} +EXPORT_SYMBOL_GPL(dev_fill_forward_path); + /** * __dev_get_by_name - find a device by its name * @net: the applicable net namespace -- cgit v1.2.1 From 897b9fae7a8ac1de2372a15234c944831c83ec26 Mon Sep 17 00:00:00 2001 From: Lu Wei Date: Thu, 25 Mar 2021 14:38:22 +0800 Subject: net: core: Fix a typo in dev_addr_lists.c Modify "funciton" to "function" in net/core/dev_addr_lists.c. Reported-by: Hulk Robot Signed-off-by: Lu Wei Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index fa1c37ec40c9..1e5bde241185 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -228,7 +228,7 @@ EXPORT_SYMBOL(__hw_addr_unsync); * @sync: function to call if address should be added * @unsync: function to call if address should be removed * - * This funciton is intended to be called from the ndo_set_rx_mode + * This function is intended to be called from the ndo_set_rx_mode * function of devices that require explicit address add/remove * notifications. The unsync function may be NULL in which case * the addresses requiring removal will simply be removed without -- cgit v1.2.1 From 6c996e19949b34d7edebed4f6b0511145c036404 Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 25 Mar 2021 15:52:45 +0100 Subject: net: change netdev_unregister_timeout_secs min value to 1 netdev_unregister_timeout_secs=0 can lead to printing the "waiting for dev to become free" message every jiffy. This is too frequent and unnecessary. Set the min value to 1 second. Also fix the merge issue introduced by "net: make unregister netdev warning timeout configurable": it changed "refcnt != 1" to "refcnt". Signed-off-by: Dmitry Vyukov Suggested-by: Eric Dumazet Fixes: 5aa3afe107d9 ("net: make unregister netdev warning timeout configurable") Cc: netdev@vger.kernel.org Cc: linux-kernel@vger.kernel.org Reviewed-by: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/dev.c | 2 +- net/core/sysctl_net_core.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 48b529d59157..b4c67a5be606 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -10460,7 +10460,7 @@ static void netdev_wait_allrefs(struct net_device *dev) refcnt = netdev_refcnt_read(dev); - if (refcnt && + if (refcnt != 1 && time_after(jiffies, warning_time + netdev_unregister_timeout_secs * HZ)) { pr_emerg("unregister_netdevice: waiting for %s to become free. Usage count = %d\n", diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index d84c8a1b280e..c8496c1142c9 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -577,7 +577,7 @@ static struct ctl_table net_core_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = SYSCTL_ZERO, + .extra1 = SYSCTL_ONE, .extra2 = &int_3600, }, { } -- cgit v1.2.1 From 7bd1590d4eba1583f6ee85e8cfe556505f761e19 Mon Sep 17 00:00:00 2001 From: Martin KaFai Lau Date: Wed, 24 Mar 2021 18:52:52 -0700 Subject: bpf: selftests: Add kfunc_call test This patch adds a few kernel function bpf_kfunc_call_test*() for the selftest's test_run purpose. They will be allowed for tc_cls prog. The selftest calling the kernel function bpf_kfunc_call_test*() is also added in this patch. Signed-off-by: Martin KaFai Lau Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210325015252.1551395-1-kafai@fb.com --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net/core') diff --git a/net/core/filter.c b/net/core/filter.c index 17dc159ec40c..cae56d08a670 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -9813,6 +9813,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .check_kfunc_call = bpf_prog_test_check_kfunc_call, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { -- cgit v1.2.1 From af825087433fb94a431edf387c1265463fce3bd1 Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Sat, 27 Mar 2021 16:15:50 +0800 Subject: net: core: Correct function name dev_uc_flush() in the kerneldoc Fix the following W=1 kernel build warning(s): net/core/dev_addr_lists.c:732: warning: expecting prototype for dev_uc_flush(). Prototype was for dev_uc_init() instead Reported-by: Hulk Robot Signed-off-by: Xiongfeng Wang Signed-off-by: David S. Miller --- net/core/dev_addr_lists.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c index 1e5bde241185..45ae6eeb2964 100644 --- a/net/core/dev_addr_lists.c +++ b/net/core/dev_addr_lists.c @@ -723,7 +723,7 @@ void dev_uc_flush(struct net_device *dev) EXPORT_SYMBOL(dev_uc_flush); /** - * dev_uc_flush - Init unicast address list + * dev_uc_init - Init unicast address list * @dev: device * * Init unicast address list. -- cgit v1.2.1 From bb2882bc6c54672b4c57a2108a18ec3acc7c878c Mon Sep 17 00:00:00 2001 From: Xiongfeng Wang Date: Sat, 27 Mar 2021 16:15:51 +0800 Subject: net: core: Correct function name netevent_unregister_notifier() in the kerneldoc Fix the following W=1 kernel build warning(s): net/core/netevent.c:45: warning: expecting prototype for netevent_unregister_notifier(). Prototype was for unregister_netevent_notifier() instead Reported-by: Hulk Robot Signed-off-by: Xiongfeng Wang Signed-off-by: David S. 
Miller --- net/core/netevent.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/netevent.c b/net/core/netevent.c index d76ed7739c70..5bb615e963cc 100644 --- a/net/core/netevent.c +++ b/net/core/netevent.c @@ -32,7 +32,7 @@ int register_netevent_notifier(struct notifier_block *nb) EXPORT_SYMBOL_GPL(register_netevent_notifier); /** - * netevent_unregister_notifier - unregister a netevent notifier block + * unregister_netevent_notifier - unregister a netevent notifier block * @nb: notifier * * Unregister a notifier previously registered by -- cgit v1.2.1 From 37f0e514db660f03f8982b8f4fbbd4b2740abe7d Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:22 -0700 Subject: skmsg: Lock ingress_skb when purging Currently we purge the ingress_skb queue only when psock refcnt goes down to 0, so locking the queue is not necessary, but in order to be called during ->close, we have to lock it here. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-2-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 07f54015238a..bebf84ed4e30 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -634,7 +634,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; - while ((skb = __skb_dequeue(&psock->ingress_skb)) != NULL) { + while ((skb = skb_dequeue(&psock->ingress_skb)) != NULL) { skb_bpf_redirect_clear(skb); kfree_skb(skb); } -- cgit v1.2.1 From b01fd6e802b6d0a635176f943315670b679d8d7b Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:23 -0700 Subject: skmsg: Introduce a spinlock to protect ingress_msg Currently we rely on lock_sock to protect ingress_msg, it is too big for this, we can actually just use a spinlock to protect this list like protecting other skb queues. __tcp_bpf_recvmsg() is still special because of peeking, it still has to use lock_sock. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-3-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index bebf84ed4e30..305dddc51857 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -592,6 +592,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) INIT_WORK(&psock->work, sk_psock_backlog); INIT_LIST_HEAD(&psock->ingress_msg); + spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); sk_psock_set_state(psock, SK_PSOCK_TX_ENABLED); @@ -638,7 +639,9 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } + spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); + spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) -- cgit v1.2.1 From 0739cd28f2645e814586c7536ba5da9825cb8029 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:24 -0700 Subject: net: Introduce skb_send_sock() for sock_map We only have skb_send_sock_locked() which requires callers to use lock_sock(). Introduce a variant skb_send_sock() which locks on its own, callers do not need to lock it any more. 
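The shape of that refactor can be sketched outside the kernel; the names below are invented and the real fragment-walking loop is elided:

#include <stddef.h>

struct chan { int fd; };	/* opaque transport, stand-in for struct sock */

typedef int (*send_op)(struct chan *c, const void *buf, size_t len);

/* The two primitive ops differ only in their locking contract. */
static int send_caller_holds_lock(struct chan *c, const void *buf, size_t len)
{
	(void)c; (void)buf;
	return (int)len;	/* pretend the whole buffer went out */
}

static int send_takes_lock_itself(struct chan *c, const void *buf, size_t len)
{
	(void)c; (void)buf;
	return (int)len;
}

/* One core routine owns the send loop and bookkeeping... */
static int __chan_send(struct chan *c, const void *buf, size_t len, send_op op)
{
	return op(c, buf, len);	/* real code: walk fragments, retry partials */
}

/* ...and two thin wrappers just pick the op, mirroring
 * skb_send_sock_locked() and skb_send_sock() in the patch. */
int chan_send_locked(struct chan *c, const void *buf, size_t len)
{
	return __chan_send(c, buf, len, send_caller_holds_lock);
}

int chan_send(struct chan *c, const void *buf, size_t len)
{
	return __chan_send(c, buf, len, send_takes_lock_itself);
}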
This will save us from adding a ->sendmsg_locked for each protocol. To reuse the code, pass function pointers to __skb_send_sock() and build skb_send_sock() and skb_send_sock_locked() on top. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Reviewed-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210331023237.41094-4-xiyou.wangcong@gmail.com --- net/core/skbuff.c | 55 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 48 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e8320b5d651a..3ad9e8425ab2 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -2500,9 +2500,32 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset, } EXPORT_SYMBOL_GPL(skb_splice_bits); -/* Send skb data on a socket. Socket must be locked. */ -int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, - int len) +static int sendmsg_unlocked(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendmsg(sock, msg, vec, num, size); +} + +static int sendpage_unlocked(struct sock *sk, struct page *page, int offset, + size_t size, int flags) +{ + struct socket *sock = sk->sk_socket; + + if (!sock) + return -EINVAL; + return kernel_sendpage(sock, page, offset, size, flags); +} + +typedef int (*sendmsg_func)(struct sock *sk, struct msghdr *msg, + struct kvec *vec, size_t num, size_t size); +typedef int (*sendpage_func)(struct sock *sk, struct page *page, int offset, + size_t size, int flags); +static int __skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, + int len, sendmsg_func sendmsg, sendpage_func sendpage) { unsigned int orig_len = len; struct sk_buff *head = skb; @@ -2522,7 +2545,8 @@ do_frag_list: memset(&msg, 0, sizeof(msg)); msg.msg_flags = MSG_DONTWAIT; - ret = kernel_sendmsg_locked(sk, &msg, &kv, 1, slen); + ret = INDIRECT_CALL_2(sendmsg, kernel_sendmsg_locked, + sendmsg_unlocked, sk, &msg, &kv, 1, slen); if (ret <= 0) goto error; @@ -2553,9 +2577,11 @@ do_frag_list: slen = min_t(size_t, len, skb_frag_size(frag) - offset); while (slen) { - ret = kernel_sendpage_locked(sk, skb_frag_page(frag), - skb_frag_off(frag) + offset, - slen, MSG_DONTWAIT); + ret = INDIRECT_CALL_2(sendpage, kernel_sendpage_locked, + sendpage_unlocked, sk, + skb_frag_page(frag), + skb_frag_off(frag) + offset, + slen, MSG_DONTWAIT); if (ret <= 0) goto error; @@ -2587,8 +2613,23 @@ out: error: return orig_len == len ? ret : orig_len - len; } + +/* Send skb data on a socket. Socket must be locked. */ +int skb_send_sock_locked(struct sock *sk, struct sk_buff *skb, int offset, + int len) +{ + return __skb_send_sock(sk, skb, offset, len, kernel_sendmsg_locked, + kernel_sendpage_locked); +} EXPORT_SYMBOL_GPL(skb_send_sock_locked); +/* Send skb data on a socket. Socket must be unlocked. */ +int skb_send_sock(struct sock *sk, struct sk_buff *skb, int offset, int len) +{ + return __skb_send_sock(sk, skb, offset, len, sendmsg_unlocked, + sendpage_unlocked); +} + /** * skb_store_bits - store bits from kernel buffer to skb * @skb: destination buffer -- cgit v1.2.1 From 799aa7f98d53e0f541fa6b4dc9aa47b4ff2178e3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:25 -0700 Subject: skmsg: Avoid lock_sock() in sk_psock_backlog() We do not have to lock the sock to avoid losing sk_socket, instead we can purge all the ingress queues when we close the socket. 
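The new locking discipline can be pictured with a small user-space analogue, with a pthread mutex standing in for the kernel spinlock and all names invented:

#include <pthread.h>
#include <stdbool.h>

struct ingress {
	pthread_mutex_t lock;	/* plays the role of psock->ingress_lock */
	bool tx_enabled;	/* plays the role of SK_PSOCK_TX_ENABLED */
	int queued;		/* stand-in for the skb queue */
};

/* Producers test the flag and enqueue inside one critical section, so a
 * concurrent stop cannot slip in between the test and the enqueue. */
static bool ingress_enqueue(struct ingress *q)
{
	bool ok;

	pthread_mutex_lock(&q->lock);
	ok = q->tx_enabled;
	if (ok)
		q->queued++;	/* real code: queue the skb, schedule the worker */
	pthread_mutex_unlock(&q->lock);
	return ok;
}

/* Stop clears the flag and purges under the same lock, then synchronizes
 * with any in-flight worker, the cancel_work_sync() step in the patch. */
static void ingress_stop(struct ingress *q)
{
	pthread_mutex_lock(&q->lock);
	q->tx_enabled = false;
	q->queued = 0;		/* purge */
	pthread_mutex_unlock(&q->lock);
	/* ...then join the worker thread before teardown */
}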
Sending or receiving packets after orphaning socket makes no sense. We do purge these queues when psock refcnt reaches zero but here we want to purge them explicitly in sock_map_close(). There are also some nasty race conditions on testing bit SK_PSOCK_TX_ENABLED and queuing/canceling the psock work, we can expand psock->ingress_lock a bit to protect them too. As noticed by John, we still have to lock the psock->work, because the same work item could be running concurrently on different CPU's. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-5-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 50 ++++++++++++++++++++++++++++++++++---------------- net/core/sock_map.c | 1 + 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 305dddc51857..9c25020086a9 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -497,7 +497,7 @@ static int sk_psock_handle_skb(struct sk_psock *psock, struct sk_buff *skb, if (!ingress) { if (!sock_writeable(psock->sk)) return -EAGAIN; - return skb_send_sock_locked(psock->sk, skb, off, len); + return skb_send_sock(psock->sk, skb, off, len); } return sk_psock_skb_ingress(psock, skb); } @@ -511,8 +511,7 @@ static void sk_psock_backlog(struct work_struct *work) u32 len, off; int ret; - /* Lock sock to avoid losing sk_socket during loop. */ - lock_sock(psock->sk); + mutex_lock(&psock->work_mutex); if (state->skb) { skb = state->skb; len = state->len; @@ -529,7 +528,7 @@ start: skb_bpf_redirect_clear(skb); do { ret = -EIO; - if (likely(psock->sk->sk_socket)) + if (!sock_flag(psock->sk, SOCK_DEAD)) ret = sk_psock_handle_skb(psock, skb, off, len, ingress); if (ret <= 0) { @@ -553,7 +552,7 @@ start: kfree_skb(skb); } end: - release_sock(psock->sk); + mutex_unlock(&psock->work_mutex); } struct sk_psock *sk_psock_init(struct sock *sk, int node) @@ -591,6 +590,7 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) spin_lock_init(&psock->link_lock); INIT_WORK(&psock->work, sk_psock_backlog); + mutex_init(&psock->work_mutex); INIT_LIST_HEAD(&psock->ingress_msg); spin_lock_init(&psock->ingress_lock); skb_queue_head_init(&psock->ingress_skb); @@ -631,7 +631,7 @@ static void __sk_psock_purge_ingress_msg(struct sk_psock *psock) } } -static void sk_psock_zap_ingress(struct sk_psock *psock) +static void __sk_psock_zap_ingress(struct sk_psock *psock) { struct sk_buff *skb; @@ -639,9 +639,7 @@ static void sk_psock_zap_ingress(struct sk_psock *psock) skb_bpf_redirect_clear(skb); kfree_skb(skb); } - spin_lock_bh(&psock->ingress_lock); __sk_psock_purge_ingress_msg(psock); - spin_unlock_bh(&psock->ingress_lock); } static void sk_psock_link_destroy(struct sk_psock *psock) @@ -654,6 +652,18 @@ static void sk_psock_link_destroy(struct sk_psock *psock) } } +void sk_psock_stop(struct sk_psock *psock, bool wait) +{ + spin_lock_bh(&psock->ingress_lock); + sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); + sk_psock_cork_free(psock); + __sk_psock_zap_ingress(psock); + spin_unlock_bh(&psock->ingress_lock); + + if (wait) + cancel_work_sync(&psock->work); +} + static void sk_psock_done_strp(struct sk_psock *psock); static void sk_psock_destroy_deferred(struct work_struct *gc) @@ -665,12 +675,12 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) sk_psock_done_strp(psock); cancel_work_sync(&psock->work); + mutex_destroy(&psock->work_mutex); psock_progs_drop(&psock->progs); sk_psock_link_destroy(psock); 
sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); if (psock->sk_redir) sock_put(psock->sk_redir); @@ -688,8 +698,7 @@ static void sk_psock_destroy(struct rcu_head *rcu) void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { - sk_psock_cork_free(psock); - sk_psock_zap_ingress(psock); + sk_psock_stop(psock, false); write_lock_bh(&sk->sk_callback_lock); sk_psock_restore_proto(sk, psock); @@ -699,7 +708,6 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) else if (psock->progs.stream_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - sk_psock_clear_state(psock, SK_PSOCK_TX_ENABLED); call_rcu(&psock->rcu, sk_psock_destroy); } @@ -770,14 +778,20 @@ static void sk_psock_skb_redirect(struct sk_buff *skb) * error that caused the pipe to break. We can't send a packet on * a socket that is in this state so we drop the skb. */ - if (!psock_other || sock_flag(sk_other, SOCK_DEAD) || - !sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + if (!psock_other || sock_flag(sk_other, SOCK_DEAD)) { + kfree_skb(skb); + return; + } + spin_lock_bh(&psock_other->ingress_lock); + if (!sk_psock_test_state(psock_other, SK_PSOCK_TX_ENABLED)) { + spin_unlock_bh(&psock_other->ingress_lock); kfree_skb(skb); return; } skb_queue_tail(&psock_other->ingress_skb, skb); schedule_work(&psock_other->work); + spin_unlock_bh(&psock_other->ingress_lock); } static void sk_psock_tls_verdict_apply(struct sk_buff *skb, struct sock *sk, int verdict) @@ -845,8 +859,12 @@ static void sk_psock_verdict_apply(struct sk_psock *psock, err = sk_psock_skb_ingress_self(psock, skb); } if (err < 0) { - skb_queue_tail(&psock->ingress_skb, skb); - schedule_work(&psock->work); + spin_lock_bh(&psock->ingress_lock); + if (sk_psock_test_state(psock, SK_PSOCK_TX_ENABLED)) { + skb_queue_tail(&psock->ingress_skb, skb); + schedule_work(&psock->work); + } + spin_unlock_bh(&psock->ingress_lock); } break; case __SK_REDIRECT: diff --git a/net/core/sock_map.c b/net/core/sock_map.c index dd53a7771d7e..e564fdeaada1 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1540,6 +1540,7 @@ void sock_map_close(struct sock *sk, long timeout) saved_close = psock->saved_close; sock_map_remove_links(sk, psock); rcu_read_unlock(); + sk_psock_stop(psock, true); release_sock(sk); saved_close(sk, timeout); } -- cgit v1.2.1 From 7786dfc41a74e0567557b5c4a28fc8482f5f5691 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:26 -0700 Subject: skmsg: Use rcu work for destroying psock The RCU callback sk_psock_destroy() only queues work psock->gc, so we can just switch to rcu work to simplify the code. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-6-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9c25020086a9..d43d43905d2c 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -666,10 +666,10 @@ void sk_psock_stop(struct sk_psock *psock, bool wait) static void sk_psock_done_strp(struct sk_psock *psock); -static void sk_psock_destroy_deferred(struct work_struct *gc) +static void sk_psock_destroy(struct work_struct *work) { - struct sk_psock *psock = container_of(gc, struct sk_psock, gc); - + struct sk_psock *psock = container_of(to_rcu_work(work), + struct sk_psock, rwork); /* No sk_callback_lock since already detached. 
*/ sk_psock_done_strp(psock); @@ -688,14 +688,6 @@ static void sk_psock_destroy_deferred(struct work_struct *gc) kfree(psock); } -static void sk_psock_destroy(struct rcu_head *rcu) -{ - struct sk_psock *psock = container_of(rcu, struct sk_psock, rcu); - - INIT_WORK(&psock->gc, sk_psock_destroy_deferred); - schedule_work(&psock->gc); -} - void sk_psock_drop(struct sock *sk, struct sk_psock *psock) { sk_psock_stop(psock, false); @@ -709,7 +701,8 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); - call_rcu(&psock->rcu, sk_psock_destroy); + INIT_RCU_WORK(&psock->rwork, sk_psock_destroy); + queue_rcu_work(system_wq, &psock->rwork); } EXPORT_SYMBOL_GPL(sk_psock_drop); -- cgit v1.2.1 From 190179f65ba8bc18dc1d38435b7932505ca5544f Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:27 -0700 Subject: skmsg: Use GFP_KERNEL in sk_psock_create_ingress_msg() This function is only called in process context. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-7-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index d43d43905d2c..656eceab73bc 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -410,7 +410,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_ATOMIC); + msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); if (unlikely(!msg)) return NULL; -- cgit v1.2.1 From 2004fdbd8a2b56757691717639f86d0eea3ab5b4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:28 -0700 Subject: sock_map: Simplify sock_map_link() a bit sock_map_link() passes down map progs, but it is confusing to see both map progs and psock progs. Make the map progs more obvious by retrieving it directly with sock_map_progs() inside sock_map_link(). Now it is aligned with sock_map_link_no_progs() too. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-8-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index e564fdeaada1..d06face0f16c 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -26,6 +26,7 @@ struct bpf_stab { static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, struct bpf_prog *old, u32 which); +static struct sk_psock_progs *sock_map_progs(struct bpf_map *map); static struct bpf_map *sock_map_alloc(union bpf_attr *attr) { @@ -224,10 +225,10 @@ out: return psock; } -static int sock_map_link(struct bpf_map *map, struct sk_psock_progs *progs, - struct sock *sk) +static int sock_map_link(struct bpf_map *map, struct sock *sk) { struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; + struct sk_psock_progs *progs = sock_map_progs(map); struct sk_psock *psock; int ret; @@ -492,7 +493,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, * and sk_write_space callbacks overridden. 
*/ if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, &stab->progs, sk); + ret = sock_map_link(map, sk); else ret = sock_map_link_no_progs(map, sk); if (ret < 0) @@ -1004,7 +1005,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, * and sk_write_space callbacks overridden. */ if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, &htab->progs, sk); + ret = sock_map_link(map, sk); else ret = sock_map_link_no_progs(map, sk); if (ret < 0) -- cgit v1.2.1 From b017055255d620b365299c3824610e0098414664 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:29 -0700 Subject: sock_map: Kill sock_map_link_no_progs() Now we can fold sock_map_link_no_progs() into sock_map_link() and get rid of sock_map_link_no_progs(). Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-9-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 55 +++++++++++++++-------------------------------------- 1 file changed, 15 insertions(+), 40 deletions(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index d06face0f16c..42d797291d34 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -225,13 +225,24 @@ out: return psock; } +static bool sock_map_redirect_allowed(const struct sock *sk); + static int sock_map_link(struct bpf_map *map, struct sock *sk) { - struct bpf_prog *msg_parser, *stream_parser, *stream_verdict; struct sk_psock_progs *progs = sock_map_progs(map); + struct bpf_prog *stream_verdict = NULL; + struct bpf_prog *stream_parser = NULL; + struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; + /* Only sockets we can redirect into/from in BPF need to hold + * refs to parser/verdict progs and have their sk_data_ready + * and sk_write_space callbacks overridden. + */ + if (!sock_map_redirect_allowed(sk)) + goto no_progs; + stream_verdict = READ_ONCE(progs->stream_verdict); if (stream_verdict) { stream_verdict = bpf_prog_inc_not_zero(stream_verdict); @@ -257,6 +268,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } +no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { ret = PTR_ERR(psock); @@ -316,27 +328,6 @@ out_put_stream_verdict: return ret; } -static int sock_map_link_no_progs(struct bpf_map *map, struct sock *sk) -{ - struct sk_psock *psock; - int ret; - - psock = sock_map_psock_get_checked(sk); - if (IS_ERR(psock)) - return PTR_ERR(psock); - - if (!psock) { - psock = sk_psock_init(sk, map->numa_node); - if (IS_ERR(psock)) - return PTR_ERR(psock); - } - - ret = sock_map_init_proto(sk, psock); - if (ret < 0) - sk_psock_put(sk, psock); - return ret; -} - static void sock_map_free(struct bpf_map *map) { struct bpf_stab *stab = container_of(map, struct bpf_stab, map); @@ -467,8 +458,6 @@ static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next) return 0; } -static bool sock_map_redirect_allowed(const struct sock *sk); - static int sock_map_update_common(struct bpf_map *map, u32 idx, struct sock *sk, u64 flags) { @@ -488,14 +477,7 @@ static int sock_map_update_common(struct bpf_map *map, u32 idx, if (!link) return -ENOMEM; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. 
- */ - if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, sk); - else - ret = sock_map_link_no_progs(map, sk); + ret = sock_map_link(map, sk); if (ret < 0) goto out_free; @@ -1000,14 +982,7 @@ static int sock_hash_update_common(struct bpf_map *map, void *key, if (!link) return -ENOMEM; - /* Only sockets we can redirect into/from in BPF need to hold - * refs to parser/verdict progs and have their sk_data_ready - * and sk_write_space callbacks overridden. - */ - if (sock_map_redirect_allowed(sk)) - ret = sock_map_link(map, sk); - else - ret = sock_map_link_no_progs(map, sk); + ret = sock_map_link(map, sk); if (ret < 0) goto out_free; -- cgit v1.2.1 From a7ba4558e69a3c2ae4ca521f015832ef44799538 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:30 -0700 Subject: sock_map: Introduce BPF_SK_SKB_VERDICT Reusing BPF_SK_SKB_STREAM_VERDICT is possible but its name is confusing and more importantly we still want to distinguish them from user-space. So we can just reuse the stream verdict code but introduce a new type of eBPF program, skb_verdict. Users are not allowed to attach stream_verdict and skb_verdict programs to the same map. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-10-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 4 +++- net/core/sock_map.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 31 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 656eceab73bc..a045812d7c78 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -697,7 +697,7 @@ void sk_psock_drop(struct sock *sk, struct sk_psock *psock) rcu_assign_sk_user_data(sk, NULL); if (psock->progs.stream_parser) sk_psock_stop_strp(sk, psock); - else if (psock->progs.stream_verdict) + else if (psock->progs.stream_verdict || psock->progs.skb_verdict) sk_psock_stop_verdict(sk, psock); write_unlock_bh(&sk->sk_callback_lock); @@ -1024,6 +1024,8 @@ static int sk_psock_verdict_recv(read_descriptor_t *desc, struct sk_buff *skb, } skb_set_owner_r(skb, sk); prog = READ_ONCE(psock->progs.stream_verdict); + if (!prog) + prog = READ_ONCE(psock->progs.skb_verdict); if (likely(prog)) { skb_dst_drop(skb); skb_bpf_redirect_clear(skb); diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 42d797291d34..c2a0411e08a8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -156,6 +156,8 @@ static void sock_map_del_link(struct sock *sk, strp_stop = true; if (psock->saved_data_ready && stab->progs.stream_verdict) verdict_stop = true; + if (psock->saved_data_ready && stab->progs.skb_verdict) + verdict_stop = true; list_del(&link->list); sk_psock_free_link(link); } @@ -232,6 +234,7 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) struct sk_psock_progs *progs = sock_map_progs(map); struct bpf_prog *stream_verdict = NULL; struct bpf_prog *stream_parser = NULL; + struct bpf_prog *skb_verdict = NULL; struct bpf_prog *msg_parser = NULL; struct sk_psock *psock; int ret; @@ -268,6 +271,15 @@ static int sock_map_link(struct bpf_map *map, struct sock *sk) } } + skb_verdict = READ_ONCE(progs->skb_verdict); + if (skb_verdict) { + skb_verdict = bpf_prog_inc_not_zero(skb_verdict); + if (IS_ERR(skb_verdict)) { + ret = PTR_ERR(skb_verdict); + goto out_put_msg_parser; + } + } + no_progs: psock = sock_map_psock_get_checked(sk); if (IS_ERR(psock)) { @@ -278,6 +290,9 @@ no_progs: if (psock) { if ((msg_parser && READ_ONCE(psock->progs.msg_parser)) || (stream_parser && 
READ_ONCE(psock->progs.stream_parser)) || + (skb_verdict && READ_ONCE(psock->progs.skb_verdict)) || + (skb_verdict && READ_ONCE(psock->progs.stream_verdict)) || + (stream_verdict && READ_ONCE(psock->progs.skb_verdict)) || (stream_verdict && READ_ONCE(psock->progs.stream_verdict))) { sk_psock_put(sk, psock); ret = -EBUSY; @@ -309,6 +324,9 @@ no_progs: } else if (!stream_parser && stream_verdict && !psock->saved_data_ready) { psock_set_prog(&psock->progs.stream_verdict, stream_verdict); sk_psock_start_verdict(sk,psock); + } else if (!stream_verdict && skb_verdict && !psock->saved_data_ready) { + psock_set_prog(&psock->progs.skb_verdict, skb_verdict); + sk_psock_start_verdict(sk, psock); } write_unlock_bh(&sk->sk_callback_lock); return 0; @@ -317,6 +335,9 @@ out_unlock_drop: out_drop: sk_psock_put(sk, psock); out_progs: + if (skb_verdict) + bpf_prog_put(skb_verdict); +out_put_msg_parser: if (msg_parser) bpf_prog_put(msg_parser); out_put_stream_parser: @@ -1442,8 +1463,15 @@ static int sock_map_prog_update(struct bpf_map *map, struct bpf_prog *prog, break; #endif case BPF_SK_SKB_STREAM_VERDICT: + if (progs->skb_verdict) + return -EBUSY; pprog = &progs->stream_verdict; break; + case BPF_SK_SKB_VERDICT: + if (progs->stream_verdict) + return -EBUSY; + pprog = &progs->skb_verdict; + break; default: return -EOPNOTSUPP; } -- cgit v1.2.1 From 8a59f9d1e3d4340659fdfee8879dc09a6f2546e1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:31 -0700 Subject: sock: Introduce sk->sk_prot->psock_update_sk_prot() Currently sockmap calls into each protocol to update the struct proto and replace it. This certainly won't work when the protocol is implemented as a module, for example, AF_UNIX. Introduce a new ops sk->sk_prot->psock_update_sk_prot(), so each protocol can implement its own way to replace the struct proto. This also helps get rid of symbol dependencies on CONFIG_INET. 
Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-11-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 5 ----- net/core/sock_map.c | 24 ++++-------------------- 2 files changed, 4 insertions(+), 25 deletions(-) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index a045812d7c78..9fc83f7cc1a0 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -562,11 +562,6 @@ struct sk_psock *sk_psock_init(struct sock *sk, int node) write_lock_bh(&sk->sk_callback_lock); - if (inet_csk_has_ulp(sk)) { - psock = ERR_PTR(-EINVAL); - goto out; - } - if (sk->sk_user_data) { psock = ERR_PTR(-EBUSY); goto out; diff --git a/net/core/sock_map.c b/net/core/sock_map.c index c2a0411e08a8..2915c7c8778b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -185,26 +185,10 @@ static void sock_map_unref(struct sock *sk, void *link_raw) static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) { - struct proto *prot; - - switch (sk->sk_type) { - case SOCK_STREAM: - prot = tcp_bpf_get_proto(sk, psock); - break; - - case SOCK_DGRAM: - prot = udp_bpf_get_proto(sk, psock); - break; - - default: + if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; - } - - if (IS_ERR(prot)) - return PTR_ERR(prot); - - sk_psock_update_proto(sk, psock, prot); - return 0; + psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; + return sk->sk_prot->psock_update_sk_prot(sk, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) @@ -556,7 +540,7 @@ static bool sock_map_redirect_allowed(const struct sock *sk) static bool sock_map_sk_is_suitable(const struct sock *sk) { - return sk_is_tcp(sk) || sk_is_udp(sk); + return !!sk->sk_prot->psock_update_sk_prot; } static bool sock_map_sk_state_allowed(const struct sock *sk) -- cgit v1.2.1 From 2bc793e3272a13e337416c057cb81c5396ad91d1 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:33 -0700 Subject: skmsg: Extract __tcp_bpf_recvmsg() and tcp_bpf_wait_data() Although these two functions are only used by TCP, they are not specific to TCP at all, both operate on skmsg and ingress_msg, so fit in net/core/skmsg.c very well. And we will need them for non-TCP, so rename and move them to skmsg.c and export them to modules. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Link: https://lore.kernel.org/bpf/20210331023237.41094-13-xiyou.wangcong@gmail.com --- net/core/skmsg.c | 98 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) (limited to 'net/core') diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 9fc83f7cc1a0..92a83c02562a 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -399,6 +399,104 @@ out: } EXPORT_SYMBOL_GPL(sk_msg_memcopy_from_iter); +int sk_msg_wait_data(struct sock *sk, struct sk_psock *psock, int flags, + long timeo, int *err) +{ + DEFINE_WAIT_FUNC(wait, woken_wake_function); + int ret = 0; + + if (sk->sk_shutdown & RCV_SHUTDOWN) + return 1; + + if (!timeo) + return ret; + + add_wait_queue(sk_sleep(sk), &wait); + sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); + ret = sk_wait_event(sk, &timeo, + !list_empty(&psock->ingress_msg) || + !skb_queue_empty(&sk->sk_receive_queue), &wait); + sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk); + remove_wait_queue(sk_sleep(sk), &wait); + return ret; +} +EXPORT_SYMBOL_GPL(sk_msg_wait_data); + +/* Receive sk_msg from psock->ingress_msg to @msg. 
*/ +int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, + int len, int flags) +{ + struct iov_iter *iter = &msg->msg_iter; + int peek = flags & MSG_PEEK; + struct sk_msg *msg_rx; + int i, copied = 0; + + msg_rx = sk_psock_peek_msg(psock); + while (copied != len) { + struct scatterlist *sge; + + if (unlikely(!msg_rx)) + break; + + i = msg_rx->sg.start; + do { + struct page *page; + int copy; + + sge = sk_msg_elem(msg_rx, i); + copy = sge->length; + page = sg_page(sge); + if (copied + copy > len) + copy = len - copied; + copy = copy_page_to_iter(page, sge->offset, copy, iter); + if (!copy) + return copied ? copied : -EFAULT; + + copied += copy; + if (likely(!peek)) { + sge->offset += copy; + sge->length -= copy; + if (!msg_rx->skb) + sk_mem_uncharge(sk, copy); + msg_rx->sg.size -= copy; + + if (!sge->length) { + sk_msg_iter_var_next(i); + if (!msg_rx->skb) + put_page(page); + } + } else { + /* Let's not optimize the peek case: if copy_page_to_iter() + * didn't copy the entire length, just break. + */ + if (copy != sge->length) + return copied; + sk_msg_iter_var_next(i); + } + + if (copied == len) + break; + } while (i != msg_rx->sg.end); + + if (unlikely(peek)) { + msg_rx = sk_psock_next_msg(psock, msg_rx); + if (!msg_rx) + break; + continue; + } + + msg_rx->sg.start = i; + if (!sge->length && msg_rx->sg.start == msg_rx->sg.end) { + msg_rx = sk_psock_dequeue_msg(psock); + kfree_sk_msg(msg_rx); + } + msg_rx = sk_psock_peek_msg(psock); + } + + return copied; +} +EXPORT_SYMBOL_GPL(sk_msg_recvmsg); + static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, struct sk_buff *skb) { -- cgit v1.2.1 From 122e6c79efe1c25816118aca9cfabe54e99c2432 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 30 Mar 2021 19:32:35 -0700 Subject: sock_map: Update sock type checks for UDP Now that UDP supports sockmap and redirection, we can safely update the sock type checks for it accordingly. Signed-off-by: Cong Wang Signed-off-by: Alexei Starovoitov Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210331023237.41094-15-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 2915c7c8778b..3d190d22b0d8 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -535,7 +535,10 @@ static bool sk_is_udp(const struct sock *sk) static bool sock_map_redirect_allowed(const struct sock *sk) { - return sk_is_tcp(sk) && sk->sk_state != TCP_LISTEN; + if (sk_is_tcp(sk)) + return sk->sk_state != TCP_LISTEN; + else + return sk->sk_state == TCP_ESTABLISHED; } static bool sock_map_sk_is_suitable(const struct sock *sk) -- cgit v1.2.1 From eeb85a14ee3494febb85ccfbee0772eda0823b13 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Mon, 5 Apr 2021 00:12:23 -0700 Subject: net: Allow to specify ifindex when device is moved to another namespace Currently, we can specify ifindex on link creation. This change allows specifying the ifindex when a device is moved to another network namespace. Even now, a device ifindex can be changed if there is another device with the same ifindex in the target namespace. So this change doesn't introduce completely new behavior; it adds more control to the process. CRIU users want to restore containers with pre-created network devices. A user will provide network devices and instructions on where they have to be restored, then CRIU will restore network namespaces and move devices into them.
The problem is that devices have to be restored with the same indexes that they have before C/R. Cc: Alexander Mikhalitsyn Suggested-by: Christian Brauner Signed-off-by: Andrei Vagin Reviewed-by: Christian Brauner Signed-off-by: David S. Miller --- net/core/dev.c | 24 +++++++++++++++++------- net/core/rtnetlink.c | 19 +++++++++++++++---- 2 files changed, 32 insertions(+), 11 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b4c67a5be606..9d1a8fac793f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11067,6 +11067,8 @@ EXPORT_SYMBOL(unregister_netdev); * @net: network namespace * @pat: If not NULL name pattern to try if the current device name * is already taken in the destination network namespace. + * @new_ifindex: If not zero, specifies device index in the target + * namespace. * * This function shuts down a device interface and moves it * to a new network namespace. On success 0 is returned, on @@ -11075,10 +11077,11 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore. */ -int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) +int dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); - int err, new_nsid, new_ifindex; + int err, new_nsid; ASSERT_RTNL(); @@ -11109,6 +11112,11 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char goto out; } + /* Check that new_ifindex isn't used yet. */ + err = -EBUSY; + if (new_ifindex && __dev_get_by_index(net, new_ifindex)) + goto out; + /* * And now a mini version of register_netdevice unregister_netdevice. */ @@ -11136,10 +11144,12 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char new_nsid = peernet2id_alloc(dev_net(dev), net, GFP_KERNEL); /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - new_ifindex = dev_new_index(net); - else - new_ifindex = dev->ifindex; + if (!new_ifindex) { + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); + else + new_ifindex = dev->ifindex; + } rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, new_ifindex); @@ -11448,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); - err = dev_change_net_namespace(dev, &init_net, fb_name); + err = dev_change_net_namespace(dev, &init_net, fb_name, 0); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 1bdcb33fb561..d51252afde0a 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2266,6 +2266,9 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } + if (tb[IFLA_NEW_IFINDEX] && nla_get_s32(tb[IFLA_NEW_IFINDEX]) <= 0) + return -EINVAL; + if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; @@ -2603,14 +2606,22 @@ static int do_setlink(const struct sk_buff *skb, return err; if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_TARGET_NETNSID]) { - struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), - tb, CAP_NET_ADMIN); + struct net *net; + int new_ifindex; + + net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - err = dev_change_net_namespace(dev, net, 
ifname); + if (tb[IFLA_NEW_IFINDEX]) + new_ifindex = nla_get_s32(tb[IFLA_NEW_IFINDEX]); + else + new_ifindex = 0; + + err = dev_change_net_namespace(dev, net, ifname, new_ifindex); put_net(net); if (err) goto errout; @@ -3452,7 +3463,7 @@ replay: if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname); + err = dev_change_net_namespace(dev, dest_net, ifname, 0); if (err < 0) goto out_unregister; } -- cgit v1.2.1 From 7e4a51319d3a71ac8002c96f817bcbeb36789b07 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 Apr 2021 23:40:03 -0700 Subject: net: introduce nla_policy for IFLA_NEW_IFINDEX With the policy in place, we don't need to check that new_ifindex is positive in validate_linkmsg(). Fixes: eeb85a14ee34 ("net: Allow to specify ifindex when device is moved to another namespace") Suggested-by: Jakub Kicinski Signed-off-by: Andrei Vagin Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index d51252afde0a..9108a7e6c0c0 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1877,6 +1877,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { .len = ALTIFNAMSIZ - 1 }, [IFLA_PERM_ADDRESS] = { .type = NLA_REJECT }, [IFLA_PROTO_DOWN_REASON] = { .type = NLA_NESTED }, + [IFLA_NEW_IFINDEX] = NLA_POLICY_MIN(NLA_S32, 1), }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2266,9 +2267,6 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) return -EINVAL; } - if (tb[IFLA_NEW_IFINDEX] && nla_get_s32(tb[IFLA_NEW_IFINDEX]) <= 0) - return -EINVAL; - if (tb[IFLA_AF_SPEC]) { struct nlattr *af; int rem, err; -- cgit v1.2.1 From 0854fa82c96ca37a35e954b7079c0bfd795affb1 Mon Sep 17 00:00:00 2001 From: Andrei Vagin Date: Tue, 6 Apr 2021 23:40:51 -0700 Subject: net: remove the new_ifindex argument from dev_change_net_namespace There is only one place where we want to specify new_ifindex. In all other cases, callers pass 0 as new_ifindex. It looks reasonable to add a low-level function with new_ifindex and to convert dev_change_net_namespace to a static inline wrapper. Fixes: eeb85a14ee34 ("net: Allow to specify ifindex when device is moved to another namespace") Suggested-by: Jakub Kicinski Signed-off-by: Andrei Vagin Acked-by: Jakub Kicinski Signed-off-by: David S. Miller --- net/core/dev.c | 10 +++++----- net/core/rtnetlink.c | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 9d1a8fac793f..33ff4a944109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -11062,7 +11062,7 @@ void unregister_netdev(struct net_device *dev) EXPORT_SYMBOL(unregister_netdev); /** - * dev_change_net_namespace - move device to different nethost namespace + * __dev_change_net_namespace - move device to different nethost namespace * @dev: device * @net: network namespace * @pat: If not NULL name pattern to try if the current device name @@ -11077,8 +11077,8 @@ EXPORT_SYMBOL(unregister_netdev); * Callers must hold the rtnl semaphore.
*/ -int dev_change_net_namespace(struct net_device *dev, struct net *net, - const char *pat, int new_ifindex) +int __dev_change_net_namespace(struct net_device *dev, struct net *net, + const char *pat, int new_ifindex) { struct net *net_old = dev_net(dev); int err, new_nsid; @@ -11202,7 +11202,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, out: return err; } -EXPORT_SYMBOL_GPL(dev_change_net_namespace); +EXPORT_SYMBOL_GPL(__dev_change_net_namespace); static int dev_cpu_dead(unsigned int oldcpu) { @@ -11458,7 +11458,7 @@ static void __net_exit default_device_exit(struct net *net) snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); if (__dev_get_by_name(&init_net, fb_name)) snprintf(fb_name, IFNAMSIZ, "dev%%d"); - err = dev_change_net_namespace(dev, &init_net, fb_name, 0); + err = dev_change_net_namespace(dev, &init_net, fb_name); if (err) { pr_emerg("%s: failed to move %s to init_net: %d\n", __func__, dev->name, err); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 9108a7e6c0c0..9f1f55785a6f 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -2619,7 +2619,7 @@ static int do_setlink(const struct sk_buff *skb, else new_ifindex = 0; - err = dev_change_net_namespace(dev, net, ifname, new_ifindex); + err = __dev_change_net_namespace(dev, net, ifname, new_ifindex); put_net(net); if (err) goto errout; @@ -3461,7 +3461,7 @@ replay: if (err < 0) goto out_unregister; if (link_net) { - err = dev_change_net_namespace(dev, dest_net, ifname, 0); + err = dev_change_net_namespace(dev, dest_net, ifname); if (err < 0) goto out_unregister; } -- cgit v1.2.1 From 51e0158a54321a48d260e95998393934bb0de52c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 6 Apr 2021 20:21:11 -0700 Subject: skmsg: Pass psock pointer to ->psock_update_sk_prot() Using sk_psock() to retrieve psock pointer from sock requires RCU read lock, but we already get psock pointer before calling ->psock_update_sk_prot() in both cases, so we can just pass it without bothering sk_psock(). Fixes: 8a59f9d1e3d4 ("sock: Introduce sk->sk_prot->psock_update_sk_prot()") Reported-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: syzbot+320a3bc8d80f478c37e4@syzkaller.appspotmail.com Reviewed-by: Jakub Sitnicki Acked-by: John Fastabend Link: https://lore.kernel.org/bpf/20210407032111.33398-1-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index 3d190d22b0d8..f473c51cbc4b 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -188,7 +188,7 @@ static int sock_map_init_proto(struct sock *sk, struct sk_psock *psock) if (!sk->sk_prot->psock_update_sk_prot) return -EINVAL; psock->psock_update_sk_prot = sk->sk_prot->psock_update_sk_prot; - return sk->sk_prot->psock_update_sk_prot(sk, false); + return sk->sk_prot->psock_update_sk_prot(sk, psock, false); } static struct sk_psock *sock_map_psock_get_checked(struct sock *sk) -- cgit v1.2.1 From aadb2bb83ff789de63b48b4edeab7329423a50d3 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 7 Apr 2021 20:05:56 -0700 Subject: sock_map: Fix a potential use-after-free in sock_map_close() The last refcnt of the psock can be gone right after sock_map_remove_links(), so sk_psock_stop() could trigger a UAF. 
The reason why I placed sk_psock_stop() there is to avoid an RCU read-side critical section; more importantly, some callees of sock_map_remove_links() are supposed to be called with the RCU read lock held, so we cannot simply get rid of the RCU read lock here. Therefore, the only choice we have is to grab an additional refcnt with sk_psock_get() and put it back after sk_psock_stop(). Fixes: 799aa7f98d53 ("skmsg: Avoid lock_sock() in sk_psock_backlog()") Reported-by: syzbot+7b6548ae483d6f4c64ae@syzkaller.appspotmail.com Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Tested-by: syzbot+7b6548ae483d6f4c64ae@syzkaller.appspotmail.com Acked-by: John Fastabend Acked-by: Jakub Sitnicki Link: https://lore.kernel.org/bpf/20210408030556.45134-1-xiyou.wangcong@gmail.com --- net/core/sock_map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock_map.c b/net/core/sock_map.c index f473c51cbc4b..6f1b82b8ad49 100644 --- a/net/core/sock_map.c +++ b/net/core/sock_map.c @@ -1521,7 +1521,7 @@ void sock_map_close(struct sock *sk, long timeout) lock_sock(sk); rcu_read_lock(); - psock = sk_psock(sk); + psock = sk_psock_get(sk); if (unlikely(!psock)) { rcu_read_unlock(); release_sock(sk); @@ -1532,6 +1532,7 @@ void sock_map_close(struct sock *sk, long timeout) sock_map_remove_links(sk, psock); rcu_read_unlock(); sk_psock_stop(psock, true); + sk_psock_put(sk, psock); release_sock(sk); saved_close(sk, timeout); } -- cgit v1.2.1 From 17c3df7078e3742bd9e907f3006a9e3469383007 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Wed, 14 Apr 2021 12:48:48 +0200 Subject: skbuff: revert "skbuff: remove some unnecessary operation in skb_segment_list()" The commit 1ddc3229ad3c ("skbuff: remove some unnecessary operation in skb_segment_list()") introduces an issue very similar to the one already fixed by commit 53475c5dd856 ("net: fix use-after-free when UDP GRO with shared fraglist"). If the GSO skb goes through skb_clone() and pskb_expand_head() before entering skb_segment_list(), the latter will unshare the frag_list skbs and will release the old list. With the reverted commit in place, when skb_segment_list() completes, skb->next points to the just-released list, and later on the kernel will hit a use-after-free. Note that since commit e0e3070a9bc9 ("udp: properly complete L4 GRO over UDP tunnel packet") the critical scenario can also be reproduced by receiving UDP-over-vxlan traffic with: NIC (NETIF_F_GRO_FRAGLIST enabled) -> vxlan -> UDP sink Attaching a packet socket to the NIC will cause skb_clone() and the tunnel decapsulation will call pskb_expand_head(). Fixes: 1ddc3229ad3c ("skbuff: remove some unnecessary operation in skb_segment_list()") Signed-off-by: Paolo Abeni Signed-off-by: David S.
Miller --- net/core/skbuff.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'net/core') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3ad9e8425ab2..14010c0eec48 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3773,13 +3773,13 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, unsigned int tnl_hlen = skb_tnl_header_len(skb); unsigned int delta_truesize = 0; unsigned int delta_len = 0; + struct sk_buff *tail = NULL; struct sk_buff *nskb, *tmp; int err; skb_push(skb, -skb_network_offset(skb) + offset); skb_shinfo(skb)->frag_list = NULL; - skb->next = list_skb; do { nskb = list_skb; @@ -3797,8 +3797,17 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, } } - if (unlikely(err)) + if (!tail) + skb->next = nskb; + else + tail->next = nskb; + + if (unlikely(err)) { + nskb->next = list_skb; goto err_linearize; + } + + tail = nskb; delta_len += nskb->len; delta_truesize += nskb->truesize; @@ -3825,7 +3834,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_gso_reset(skb); - skb->prev = nskb; + skb->prev = tail; if (skb_needs_linearize(skb, features) && __skb_linearize(skb)) -- cgit v1.2.1 From 38ebcf5096a86762b82262e96b2c8b170fe79040 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 15 Apr 2021 10:37:53 -0700 Subject: scm: optimize put_cmsg() Calling two copy_to_user() for very small regions has very high overhead. Switch to inlined unsafe_put_user() to save one stac/clac sequence, and avoid copy_to_user(). Signed-off-by: Eric Dumazet Cc: Soheil Hassas Yeganeh Acked-by: Soheil Hassas Yeganeh Signed-off-by: David S. Miller --- net/core/scm.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index 8156d4fb8a39..bd96c922041d 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -228,14 +228,16 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (msg->msg_control_is_user) { struct cmsghdr __user *cm = msg->msg_control_user; - struct cmsghdr cmhdr; - - cmhdr.cmsg_level = level; - cmhdr.cmsg_type = type; - cmhdr.cmsg_len = cmlen; - if (copy_to_user(cm, &cmhdr, sizeof cmhdr) || - copy_to_user(CMSG_USER_DATA(cm), data, cmlen - sizeof(*cm))) - return -EFAULT; + + if (!user_write_access_begin(cm, cmlen)) + goto efault; + + unsafe_put_user(len, &cm->cmsg_len, efault_end); + unsafe_put_user(level, &cm->cmsg_level, efault_end); + unsafe_put_user(type, &cm->cmsg_type, efault_end); + unsafe_copy_to_user(CMSG_USER_DATA(cm), data, + cmlen - sizeof(*cm), efault_end); + user_write_access_end(); } else { struct cmsghdr *cm = msg->msg_control; @@ -249,6 +251,11 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) msg->msg_control += cmlen; msg->msg_controllen -= cmlen; return 0; + +efault_end: + user_write_access_end(); +efault: + return -EFAULT; } EXPORT_SYMBOL(put_cmsg); -- cgit v1.2.1 From e7ad33fa7bc5f788cdb14eea68c65c4da0f06edf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 16 Apr 2021 11:35:38 -0700 Subject: scm: fix a typo in put_cmsg() We need to store cmlen instead of len in cm->cmsg_len. Fixes: 38ebcf5096a8 ("scm: optimize put_cmsg()") Signed-off-by: Eric Dumazet Reported-by: Jakub Kicinski Signed-off-by: David S. 
Miller --- net/core/scm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/scm.c b/net/core/scm.c index bd96c922041d..ae3085d9aae8 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -232,7 +232,7 @@ int put_cmsg(struct msghdr * msg, int level, int type, int len, void *data) if (!user_write_access_begin(cm, cmlen)) goto efault; - unsafe_put_user(len, &cm->cmsg_len, efault_end); + unsafe_put_user(cmlen, &cm->cmsg_len, efault_end); unsafe_put_user(level, &cm->cmsg_level, efault_end); unsafe_put_user(type, &cm->cmsg_type, efault_end); unsafe_copy_to_user(CMSG_USER_DATA(cm), data, -- cgit v1.2.1 From 1e3d976dbb23b3fce544752b434bdc32ce64aabc Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Fri, 16 Apr 2021 14:31:51 -0500 Subject: flow_dissector: Fix out-of-bounds warning in __skb_flow_bpf_to_target() Fix the following out-of-bounds warning: net/core/flow_dissector.c:835:3: warning: 'memcpy' offset [33, 48] from the object at 'flow_keys' is out of the bounds of referenced subobject 'ipv6_src' with type '__u32[4]' {aka 'unsigned int[4]'} at offset 16 [-Warray-bounds] The problem is that the original code is trying to copy data into a couple of struct members adjacent to each other in a single call to memcpy(). So, the compiler legitimately complains about it. As these are just a couple of members, fix this by copying each one of them in separate calls to memcpy(). This helps with the ongoing efforts to globally enable -Warray-bounds and get us closer to being able to tighten the FORTIFY_SOURCE routines on memcpy(). Link: https://github.com/KSPP/linux/issues/109 Reported-by: kernel test robot Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller --- net/core/flow_dissector.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net/core') diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 5985029e43d4..3ed7c98a98e1 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -832,8 +832,10 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, key_addrs = skb_flow_dissector_target(flow_dissector, FLOW_DISSECTOR_KEY_IPV6_ADDRS, target_container); - memcpy(&key_addrs->v6addrs, &flow_keys->ipv6_src, - sizeof(key_addrs->v6addrs)); + memcpy(&key_addrs->v6addrs.src, &flow_keys->ipv6_src, + sizeof(key_addrs->v6addrs.src)); + memcpy(&key_addrs->v6addrs.dst, &flow_keys->ipv6_dst, + sizeof(key_addrs->v6addrs.dst)); key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS; } -- cgit v1.2.1 From 7ad18ff6449cbd6beb26b53128ddf56d2685aa93 Mon Sep 17 00:00:00 2001 From: Alexander Lobakin Date: Mon, 19 Apr 2021 12:53:06 +0000 Subject: gro: fix napi_gro_frags() Fast GRO breakage due to IP alignment check Commit 38ec4944b593 ("gro: ensure frag0 meets IP header alignment") did the right thing, but missed the fact that the napi_gro_frags() logic calls skb_gro_reset_offset() *before* pulling the Ethernet header into the skb linear space. As a result, the introduced check for the frag0 address being aligned to 4 always fails for it, since the Ethernet header is 14 bytes long and, with NET_IP_ALIGN, its start is not aligned to 4. Fix this by adding an @nhoff argument to skb_gro_reset_offset() which tells whether an IP header is placed right at the start of frag0 or not. This restores Fast GRO for napi_gro_frags(), which became very slow after the mentioned commit, and preserves the introduced check to avoid silent unaligned accesses.
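To make the breakage concrete, here is a worked example (offsets assumed from the description above, with frag0 starting at the Ethernet header and ETH_HLEN == 14):

	skb_frag_off(frag0)      =  2    /* NET_IP_ALIGN: IP header aligned, MAC header at +2 */
	old check:  2 & 3        =  2    /* fast path always rejected */
	new check: (2 + 14) & 3  =  0    /* fast path accepted again */

As the diff below shows, napi_frags_skb() passes the Ethernet header length as nhoff, while napi_gro_receive() passes 0 because its skb->data already sits at the network header.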
From v1 [0]: - inline tiny skb_gro_reset_offset() to let the code be optimized more effectively (esp. for the !NET_IP_ALIGN case) (Eric); - pull in Reviewed-by from Eric. [0] https://lore.kernel.org/netdev/20210418114200.5839-1-alobakin@pm.me Fixes: 38ec4944b593 ("gro: ensure frag0 meets IP header alignment") Reviewed-by: Eric Dumazet Signed-off-by: Alexander Lobakin Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index 1f79b9aa9a3f..15fe36332fb8 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5914,7 +5914,7 @@ static struct list_head *gro_list_prepare(struct napi_struct *napi, return head; } -static void skb_gro_reset_offset(struct sk_buff *skb) +static inline void skb_gro_reset_offset(struct sk_buff *skb, u32 nhoff) { const struct skb_shared_info *pinfo = skb_shinfo(skb); const skb_frag_t *frag0 = &pinfo->frags[0]; @@ -5925,7 +5925,7 @@ static void skb_gro_reset_offset(struct sk_buff *skb) if (!skb_headlen(skb) && pinfo->nr_frags && !PageHighMem(skb_frag_page(frag0)) && - (!NET_IP_ALIGN || !(skb_frag_off(frag0) & 3))) { + (!NET_IP_ALIGN || !((skb_frag_off(frag0) + nhoff) & 3))) { NAPI_GRO_CB(skb)->frag0 = skb_frag_address(frag0); NAPI_GRO_CB(skb)->frag0_len = min_t(unsigned int, skb_frag_size(frag0), @@ -6143,7 +6143,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) skb_mark_napi_id(skb, napi); trace_napi_gro_receive_entry(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, 0); ret = napi_skb_finish(napi, skb, dev_gro_receive(napi, skb)); trace_napi_gro_receive_exit(ret); @@ -6232,7 +6232,7 @@ static struct sk_buff *napi_frags_skb(struct napi_struct *napi) napi->skb = NULL; skb_reset_mac_header(skb); - skb_gro_reset_offset(skb); + skb_gro_reset_offset(skb, hlen); if (unlikely(skb_gro_header_hard(skb, hlen))) { eth = skb_gro_header_slow(skb, hlen, 0); -- cgit v1.2.1 From 3e1e58d64c3d0a6789f9d865936c4ce46b20f3f5 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Mon, 19 Apr 2021 15:01:03 +0200 Subject: net: add generic selftest support Port some parts of the stmmac selftest and reuse it as a basic generic selftest library. This patch was tested with the following combinations: - iMX6DL FEC -> AT8035 - iMX6DL FEC -> SJA1105Q switch -> KSZ8081 - iMX6DL FEC -> SJA1105Q switch -> KSZ9031 - AR9331 ag71xx -> AR9331 PHY - AR9331 ag71xx -> AR9331 switch -> AR9331 PHY Signed-off-by: Oleksij Rempel Signed-off-by: David S. Miller --- net/core/Makefile | 1 + net/core/selftests.c | 400 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 401 insertions(+) create mode 100644 net/core/selftests.c (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 0c2233c826fd..1a6168d8f23b 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -33,6 +33,7 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o ifeq ($(CONFIG_INET),y) +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o endif diff --git a/net/core/selftests.c b/net/core/selftests.c new file mode 100644 index 000000000000..ba7b0171974c --- /dev/null +++ b/net/core/selftests.c @@ -0,0 +1,400 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2019 Synopsys, Inc. and/or its affiliates.
+ * stmmac Selftests Support + * + * Author: Jose Abreu + * + * Ported from stmmac by: + * Copyright (C) 2021 Oleksij Rempel + */ + +#include <linux/phy.h> +#include <net/selftests.h> +#include <net/tcp.h> +#include <net/udp.h> + +struct net_packet_attrs { + unsigned char *src; + unsigned char *dst; + u32 ip_src; + u32 ip_dst; + bool tcp; + u16 sport; + u16 dport; + int timeout; + int size; + int max_size; + u8 id; + u16 queue_mapping; +}; + +struct net_test_priv { + struct net_packet_attrs *packet; + struct packet_type pt; + struct completion comp; + int double_vlan; + int vlan_id; + int ok; +}; + +struct netsfhdr { + __be32 version; + __be64 magic; + u8 id; +} __packed; + +static u8 net_test_next_id; + +#define NET_TEST_PKT_SIZE (sizeof(struct ethhdr) + sizeof(struct iphdr) + \ + sizeof(struct netsfhdr)) +#define NET_TEST_PKT_MAGIC 0xdeadcafecafedeadULL +#define NET_LB_TIMEOUT msecs_to_jiffies(200) + +static struct sk_buff *net_test_get_skb(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct sk_buff *skb = NULL; + struct udphdr *uhdr = NULL; + struct tcphdr *thdr = NULL; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct iphdr *ihdr; + int iplen, size; + + size = attr->size + NET_TEST_PKT_SIZE; + + if (attr->tcp) + size += sizeof(struct tcphdr); + else + size += sizeof(struct udphdr); + + if (attr->max_size && attr->max_size > size) + size = attr->max_size; + + skb = netdev_alloc_skb(ndev, size); + if (!skb) + return NULL; + + prefetchw(skb->data); + + ehdr = skb_push(skb, ETH_HLEN); + skb_reset_mac_header(skb); + + skb_set_network_header(skb, skb->len); + ihdr = skb_put(skb, sizeof(*ihdr)); + + skb_set_transport_header(skb, skb->len); + if (attr->tcp) + thdr = skb_put(skb, sizeof(*thdr)); + else + uhdr = skb_put(skb, sizeof(*uhdr)); + + eth_zero_addr(ehdr->h_dest); + + if (attr->src) + ether_addr_copy(ehdr->h_source, attr->src); + if (attr->dst) + ether_addr_copy(ehdr->h_dest, attr->dst); + + ehdr->h_proto = htons(ETH_P_IP); + + if (attr->tcp) { + thdr->source = htons(attr->sport); + thdr->dest = htons(attr->dport); + thdr->doff = sizeof(struct tcphdr) / 4; + thdr->check = 0; + } else { + uhdr->source = htons(attr->sport); + uhdr->dest = htons(attr->dport); + uhdr->len = htons(sizeof(*shdr) + sizeof(*uhdr) + attr->size); + if (attr->max_size) + uhdr->len = htons(attr->max_size - + (sizeof(*ihdr) + sizeof(*ehdr))); + uhdr->check = 0; + } + + ihdr->ihl = 5; + ihdr->ttl = 32; + ihdr->version = 4; + if (attr->tcp) + ihdr->protocol = IPPROTO_TCP; + else + ihdr->protocol = IPPROTO_UDP; + iplen = sizeof(*ihdr) + sizeof(*shdr) + attr->size; + if (attr->tcp) + iplen += sizeof(*thdr); + else + iplen += sizeof(*uhdr); + + if (attr->max_size) + iplen = attr->max_size - sizeof(*ehdr); + + ihdr->tot_len = htons(iplen); + ihdr->frag_off = 0; + ihdr->saddr = htonl(attr->ip_src); + ihdr->daddr = htonl(attr->ip_dst); + ihdr->tos = 0; + ihdr->id = 0; + ip_send_check(ihdr); + + shdr = skb_put(skb, sizeof(*shdr)); + shdr->version = 0; + shdr->magic = cpu_to_be64(NET_TEST_PKT_MAGIC); + attr->id = net_test_next_id; + shdr->id = net_test_next_id++; + + if (attr->size) + skb_put(skb, attr->size); + if (attr->max_size && attr->max_size > skb->len) + skb_put(skb, attr->max_size - skb->len); + + skb->csum = 0; + skb->ip_summed = CHECKSUM_PARTIAL; + if (attr->tcp) { + thdr->check = ~tcp_v4_check(skb->len, ihdr->saddr, + ihdr->daddr, 0); + skb->csum_start = skb_transport_header(skb) - skb->head; + skb->csum_offset = offsetof(struct tcphdr, check); + } else { + udp4_hwcsum(skb, ihdr->saddr, ihdr->daddr); + } + + skb->protocol = htons(ETH_P_IP); +
skb->pkt_type = PACKET_HOST; + skb->dev = ndev; + + return skb; +} + +static int net_test_loopback_validate(struct sk_buff *skb, + struct net_device *ndev, + struct packet_type *pt, + struct net_device *orig_ndev) +{ + struct net_test_priv *tpriv = pt->af_packet_priv; + unsigned char *src = tpriv->packet->src; + unsigned char *dst = tpriv->packet->dst; + struct netsfhdr *shdr; + struct ethhdr *ehdr; + struct udphdr *uhdr; + struct tcphdr *thdr; + struct iphdr *ihdr; + + skb = skb_unshare(skb, GFP_ATOMIC); + if (!skb) + goto out; + + if (skb_linearize(skb)) + goto out; + if (skb_headlen(skb) < (NET_TEST_PKT_SIZE - ETH_HLEN)) + goto out; + + ehdr = (struct ethhdr *)skb_mac_header(skb); + if (dst) { + if (!ether_addr_equal_unaligned(ehdr->h_dest, dst)) + goto out; + } + + if (src) { + if (!ether_addr_equal_unaligned(ehdr->h_source, src)) + goto out; + } + + ihdr = ip_hdr(skb); + if (tpriv->double_vlan) + ihdr = (struct iphdr *)(skb_network_header(skb) + 4); + + if (tpriv->packet->tcp) { + if (ihdr->protocol != IPPROTO_TCP) + goto out; + + thdr = (struct tcphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (thdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)thdr + sizeof(*thdr)); + } else { + if (ihdr->protocol != IPPROTO_UDP) + goto out; + + uhdr = (struct udphdr *)((u8 *)ihdr + 4 * ihdr->ihl); + if (uhdr->dest != htons(tpriv->packet->dport)) + goto out; + + shdr = (struct netsfhdr *)((u8 *)uhdr + sizeof(*uhdr)); + } + + if (shdr->magic != cpu_to_be64(NET_TEST_PKT_MAGIC)) + goto out; + if (tpriv->packet->id != shdr->id) + goto out; + + tpriv->ok = true; + complete(&tpriv->comp); +out: + kfree_skb(skb); + return 0; +} + +static int __net_test_loopback(struct net_device *ndev, + struct net_packet_attrs *attr) +{ + struct net_test_priv *tpriv; + struct sk_buff *skb = NULL; + int ret = 0; + + tpriv = kzalloc(sizeof(*tpriv), GFP_KERNEL); + if (!tpriv) + return -ENOMEM; + + tpriv->ok = false; + init_completion(&tpriv->comp); + + tpriv->pt.type = htons(ETH_P_IP); + tpriv->pt.func = net_test_loopback_validate; + tpriv->pt.dev = ndev; + tpriv->pt.af_packet_priv = tpriv; + tpriv->packet = attr; + dev_add_pack(&tpriv->pt); + + skb = net_test_get_skb(ndev, attr); + if (!skb) { + ret = -ENOMEM; + goto cleanup; + } + + ret = dev_direct_xmit(skb, attr->queue_mapping); + if (ret < 0) { + goto cleanup; + } else if (ret > 0) { + ret = -ENETUNREACH; + goto cleanup; + } + + if (!attr->timeout) + attr->timeout = NET_LB_TIMEOUT; + + wait_for_completion_timeout(&tpriv->comp, attr->timeout); + ret = tpriv->ok ? 0 : -ETIMEDOUT; + +cleanup: + dev_remove_pack(&tpriv->pt); + kfree(tpriv); + return ret; +} + +static int net_test_netif_carrier(struct net_device *ndev) +{ + return netif_carrier_ok(ndev) ? 0 : -ENOLINK; +} + +static int net_test_phy_phydev(struct net_device *ndev) +{ + return ndev->phydev ? 
0 : -EOPNOTSUPP; } + +static int net_test_phy_loopback_enable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, true); +} + +static int net_test_phy_loopback_disable(struct net_device *ndev) +{ + if (!ndev->phydev) + return -EOPNOTSUPP; + + return phy_loopback(ndev->phydev, false); +} + +static int net_test_phy_loopback_udp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + return __net_test_loopback(ndev, &attr); +} + +static int net_test_phy_loopback_tcp(struct net_device *ndev) +{ + struct net_packet_attrs attr = { }; + + attr.dst = ndev->dev_addr; + attr.tcp = true; + return __net_test_loopback(ndev, &attr); +} + +static const struct net_test { + char name[ETH_GSTRING_LEN]; + int (*fn)(struct net_device *ndev); +} net_selftests[] = { + { + .name = "Carrier ", + .fn = net_test_netif_carrier, + }, { + .name = "PHY dev is present ", + .fn = net_test_phy_phydev, + }, { + /* This test should be done before all PHY loopback test */ + .name = "PHY internal loopback, enable ", + .fn = net_test_phy_loopback_enable, + }, { + .name = "PHY internal loopback, UDP ", + .fn = net_test_phy_loopback_udp, + }, { + .name = "PHY internal loopback, TCP ", + .fn = net_test_phy_loopback_tcp, + }, { + /* This test should be done after all PHY loopback test */ + .name = "PHY internal loopback, disable", + .fn = net_test_phy_loopback_disable, + }, +}; + +void net_selftest(struct net_device *ndev, struct ethtool_test *etest, u64 *buf) +{ + int count = net_selftest_get_count(); + int i; + + memset(buf, 0, sizeof(*buf) * count); + net_test_next_id = 0; + + if (etest->flags != ETH_TEST_FL_OFFLINE) { + netdev_err(ndev, "Only offline tests are supported\n"); + etest->flags |= ETH_TEST_FL_FAILED; + return; + } + + + for (i = 0; i < count; i++) { + buf[i] = net_selftests[i].fn(ndev); + if (buf[i] && (buf[i] != -EOPNOTSUPP)) + etest->flags |= ETH_TEST_FL_FAILED; + } +} +EXPORT_SYMBOL_GPL(net_selftest); + +int net_selftest_get_count(void) +{ + return ARRAY_SIZE(net_selftests); +} +EXPORT_SYMBOL_GPL(net_selftest_get_count); + +void net_selftest_get_strings(u8 *data) +{ + u8 *p = data; + int i; + + for (i = 0; i < net_selftest_get_count(); i++) { + snprintf(p, ETH_GSTRING_LEN, "%2d. %s", i + 1, + net_selftests[i].name); + p += ETH_GSTRING_LEN; + } +} +EXPORT_SYMBOL_GPL(net_selftest_get_strings); + +MODULE_LICENSE("GPL v2"); +MODULE_AUTHOR("Oleksij Rempel <o.rempel@pengutronix.de>"); -- cgit v1.2.1 From eefb45eef5c4c425e87667af8f5e904fbdd47abf Mon Sep 17 00:00:00 2001 From: Chinmay Agarwal Date: Thu, 22 Apr 2021 01:12:22 +0530 Subject: neighbour: Prevent race condition in neighbour subsystem The following race condition was detected: [thread A]: Executing: __netif_receive_skb() -> __netif_receive_skb_core() -> arp_rcv() -> arp_process(). arp_process() calls __neigh_lookup(), which takes a reference on neighbour entry 'n'. Moving further along, arp_process() calls neigh_update() -> __neigh_update(). The neighbour entry is unlocked just before a call to neigh_update_gc_list. This unlocking paves the way for another thread that may take a reference on the same entry, mark it dead and remove it from gc_list. [thread B]: neigh_flush_dev() is under execution and calls neigh_mark_dead(n), marking the neighbour entry 'n' as dead; n will also be removed from gc_list. Moving further along, neigh_flush_dev() calls neigh_cleanup_and_release(n), but since the reference count was increased by thread A, 'n' couldn't be destroyed. [thread A]: Code hits neigh_update_gc_list, with the neighbour entry set as dead.
[thread A]: arp_process() finally calls neigh_release(n), destroying the neighbour entry, and we have a destroyed entry still part of gc_list. Fixes: eb4e8fac00d1 ("neighbour: Prevent a dead entry from updating gc_list") Signed-off-by: Chinmay Agarwal Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/core') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 8379719d1dce..98f20efbfadf 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,6 +131,9 @@ static void neigh_update_gc_list(struct neighbour *n) write_lock_bh(&n->tbl->lock); write_lock(&n->lock); + if (n->dead) + goto out; + /* remove from the gc list if new state is permanent or if neighbor * is externally learned; otherwise entry should be on the gc list */ @@ -147,6 +150,7 @@ static void neigh_update_gc_list(struct neighbour *n) atomic_inc(&n->tbl->gc_entries); } +out: write_unlock(&n->lock); write_unlock_bh(&n->tbl->lock); } -- cgit v1.2.1 From 22b6034323fd736f260e00b9ea85c634abeb3446 Mon Sep 17 00:00:00 2001 From: Martin Willi Date: Mon, 19 Apr 2021 16:15:59 +0200 Subject: net, xdp: Update pkt_type if generic XDP changes unicast MAC MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If a generic XDP program changes the destination MAC address from/to multicast/broadcast, the skb->pkt_type is updated to properly handle the packet when passed up the stack. When changing the MAC from/to the NIC's MAC, PACKET_HOST/OTHERHOST is not updated, though, making the behavior different from that of native XDP. Remember the PACKET_HOST/OTHERHOST state before calling the program in generic XDP, and update pkt_type accordingly if the destination MAC address has changed. As eth_type_trans() assumes a default pkt_type of PACKET_HOST, restore that before calling it. The use case for this is when an XDP program wants to push received packets up the stack by rewriting the MAC to the NIC's MAC, for example by cluster nodes sharing MAC addresses.
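As a rough sketch of that use case, an XDP program that claims frames for the host could look like the following (hypothetical program, not part of this patch; the shared MAC value and program name are placeholders):

// SPDX-License-Identifier: GPL-2.0
#include <linux/bpf.h>
#include <linux/if_ether.h>
#include <bpf/bpf_helpers.h>

/* Claim frames for this node by rewriting the destination MAC to the
 * NIC's (shared) address, then let them continue up the stack.  With
 * the fix below, generic XDP will flip skb->pkt_type to PACKET_HOST.
 */
static const unsigned char shared_mac[ETH_ALEN] = { 0x02, 0x11, 0x22, 0x33, 0x44, 0x55 };

SEC("xdp")
int claim_for_host(struct xdp_md *ctx)
{
	void *data = (void *)(long)ctx->data;
	void *data_end = (void *)(long)ctx->data_end;
	struct ethhdr *eth = data;

	if ((void *)(eth + 1) > data_end)
		return XDP_PASS;

	__builtin_memcpy(eth->h_dest, shared_mac, ETH_ALEN);
	return XDP_PASS;
}

char _license[] SEC("license") = "GPL";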
Fixes: 297249569932 ("net: fix generic XDP to handle if eth header was mangled") Signed-off-by: Martin Willi Signed-off-by: Daniel Borkmann Acked-by: Toke Høiland-Jørgensen Link: https://lore.kernel.org/bpf/20210419141559.8611-1-martin@strongswan.org --- net/core/dev.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/dev.c b/net/core/dev.c index b4c67a5be606..6a1ef7a15bed 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4723,10 +4723,10 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, void *orig_data, *orig_data_end, *hard_start; struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; + bool orig_bcast, orig_host; u32 mac_len, frame_sz; __be16 orig_eth_type; struct ethhdr *eth; - bool orig_bcast; int off; /* Reinjected packets coming from act_mirred or similar should @@ -4773,6 +4773,7 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, orig_data_end = xdp->data_end; orig_data = xdp->data; eth = (struct ethhdr *)xdp->data; + orig_host = ether_addr_equal_64bits(eth->h_dest, skb->dev->dev_addr); orig_bcast = is_multicast_ether_addr_64bits(eth->h_dest); orig_eth_type = eth->h_proto; @@ -4800,8 +4801,11 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, /* check if XDP changed eth hdr such SKB needs update */ eth = (struct ethhdr *)xdp->data; if ((orig_eth_type != eth->h_proto) || + (orig_host != ether_addr_equal_64bits(eth->h_dest, + skb->dev->dev_addr)) || (orig_bcast != is_multicast_ether_addr_64bits(eth->h_dest))) { __skb_push(skb, ETH_HLEN); + skb->pkt_type = PACKET_HOST; skb->protocol = eth_type_trans(skb, skb->dev); } -- cgit v1.2.1 From ed744d819379ddeec5744b0bfc7eb6d0a8ac4e46 Mon Sep 17 00:00:00 2001 From: Tonghao Zhang Date: Thu, 22 Apr 2021 21:41:51 +0800 Subject: net: sock: remove the unnecessary check in proto_register tw_prot_cleanup() will check twsk_prot itself. Fixes: 0f5907af3913 ("net: Fix potential memory leak in proto_register()") Cc: Miaohe Lin Signed-off-by: Tonghao Zhang Signed-off-by: David S. Miller --- net/core/sock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/sock.c b/net/core/sock.c index 5ec90f99e102..c761c4a0b66b 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -3531,7 +3531,7 @@ int proto_register(struct proto *prot, int alloc_slab) return ret; out_free_timewait_sock_slab: - if (alloc_slab && prot->twsk_prot) + if (alloc_slab) tw_prot_cleanup(prot->twsk_prot); out_free_request_sock_slab: if (alloc_slab) { -- cgit v1.2.1 From a1ab3e4554b5342b34845df452601ebd5a310d0a Mon Sep 17 00:00:00 2001 From: Parav Pandit Date: Wed, 10 Mar 2021 15:35:03 +0200 Subject: devlink: Extend SF port attributes to have external attribute Extended SF port attributes to have an optional external flag, similar to PCI PF and VF port attributes. The external attribute is required to generate a unique phys_port_name when the PF number and SF number overlap between two controllers, similar to SR-IOV VFs. When a SF is for an external controller, an example view of the external SF port and config sequence follows.
On the eswitch system: $ devlink dev eswitch set pci/0033:01:00.0 mode switchdev $ devlink port show pci/0033:01:00.0/196607: type eth netdev enP51p1s0f0np0 flavour physical port 0 splittable false pci/0033:01:00.0/131072: type eth netdev eth0 flavour pcipf controller 1 pfnum 0 external true splittable false function: hw_addr 00:00:00:00:00:00 $ devlink port add pci/0033:01:00.0 flavour pcisf pfnum 0 sfnum 77 controller 1 pci/0033:01:00.0/163840: type eth netdev eth1 flavour pcisf controller 1 pfnum 0 sfnum 77 splittable false function: hw_addr 00:00:00:00:00:00 state inactive opstate detached phys_port_name construction: $ cat /sys/class/net/eth1/phys_port_name c1pf0sf77 Signed-off-by: Parav Pandit Reviewed-by: Jiri Pirko Reviewed-by: Vu Pham Signed-off-by: Saeed Mahameed --- net/core/devlink.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/devlink.c b/net/core/devlink.c index 737b61c2976e..4eb969518ee0 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -8599,9 +8599,10 @@ EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set); * @controller: associated controller number for the devlink port instance * @pf: associated PF for the devlink port instance * @sf: associated SF of a PF for the devlink port instance + * @external: indicates if the port is for an external controller */ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 controller, - u16 pf, u32 sf) + u16 pf, u32 sf, bool external) { struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; @@ -8615,6 +8616,7 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro attrs->pci_sf.controller = controller; attrs->pci_sf.pf = pf; attrs->pci_sf.sf = sf; + attrs->pci_sf.external = external; } EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_sf_set); @@ -8667,6 +8669,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port, attrs->pci_vf.pf, attrs->pci_vf.vf); break; case DEVLINK_PORT_FLAVOUR_PCI_SF: + if (attrs->pci_sf.external) { + n = snprintf(name, len, "c%u", attrs->pci_sf.controller); + if (n >= len) + return -EINVAL; + len -= n; + name += n; + } n = snprintf(name, len, "pf%usf%u", attrs->pci_sf.pf, attrs->pci_sf.sf); break; -- cgit v1.2.1 From 4a52dd8fefb45626dace70a63c0738dbd83b7edb Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 28 Apr 2021 15:09:46 +0200 Subject: net: selftest: fix build issue if INET is disabled In case an Ethernet driver is enabled and INET is disabled, the selftest library will fail to build.
Reported-by: Randy Dunlap Fixes: 3e1e58d64c3d ("net: add generic selftest support") Signed-off-by: Oleksij Rempel Acked-by: Randy Dunlap # build-tested Reviewed-by: Florian Fainelli Link: https://lore.kernel.org/r/20210428130947.29649-1-o.rempel@pengutronix.de Signed-off-by: Jakub Kicinski --- net/core/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/core') diff --git a/net/core/Makefile b/net/core/Makefile index 1a6168d8f23b..f7f16650fe9e 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -21,6 +21,7 @@ obj-$(CONFIG_NETPOLL) += netpoll.o obj-$(CONFIG_FIB_RULES) += fib_rules.o obj-$(CONFIG_TRACEPOINTS) += net-traces.o obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o +obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o @@ -33,7 +34,6 @@ obj-$(CONFIG_NET_DEVLINK) += devlink.o obj-$(CONFIG_GRO_CELLS) += gro_cells.o obj-$(CONFIG_FAILOVER) += failover.o ifeq ($(CONFIG_INET),y) -obj-$(CONFIG_NET_SELFTESTS) += selftests.o obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o endif -- cgit v1.2.1
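For reference, a NIC driver consumes this selftest library from its ethtool ops roughly as follows (a sketch against the net_selftest* helpers exported above; the foo_* names and the driver itself are placeholders):

#include <linux/ethtool.h>
#include <linux/netdevice.h>
#include <net/selftests.h>

/* Wire the generic selftests into a hypothetical driver. */
static void foo_self_test(struct net_device *ndev,
			  struct ethtool_test *etest, u64 *buf)
{
	net_selftest(ndev, etest, buf);
}

static int foo_get_sset_count(struct net_device *ndev, int sset)
{
	return sset == ETH_SS_TEST ? net_selftest_get_count() : -EOPNOTSUPP;
}

static void foo_get_strings(struct net_device *ndev, u32 sset, u8 *data)
{
	if (sset == ETH_SS_TEST)
		net_selftest_get_strings(data);
}

static const struct ethtool_ops foo_ethtool_ops = {
	.self_test	= foo_self_test,
	.get_sset_count	= foo_get_sset_count,
	.get_strings	= foo_get_strings,
};

With the ops wired up, the suite can then be run from userspace via ethtool's offline self-test, e.g. "ethtool -t eth0 offline".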