From 1cd7884dfd78df6284d27b008823b0b4a808f196 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:15 -0400 Subject: udp: expose inet cork to udp UDP segmentation offload needs access to inet_cork in the udp layer. Pass the struct to ip(6)_make_skb instead of allocating it on the stack in that function itself. This patch is a noop otherwise. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 24b5c59b1c53..6b9d8017b319 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1030,9 +1030,11 @@ back_from_confirm: /* Lockless fast path for the non-corking case. */ if (!corkreq) { + struct inet_cork cork; + skb = ip_make_skb(sk, fl4, getfrag, msg, ulen, sizeof(struct udphdr), &ipc, &rt, - msg->msg_flags); + &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) err = udp_send_skb(skb, fl4); -- cgit v1.2.1 From bec1f6f697362c5bc635dacd7ac8499d0a10a4e7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:17 -0400 Subject: udp: generate gso with UDP_SEGMENT Support generic segmentation offload for udp datagrams. Callers can concatenate and send at once the payload of multiple datagrams with the same destination. To set segment size, the caller sets socket option UDP_SEGMENT to the length of each discrete payload. This value must be smaller than or equal to the relevant MTU. A follow-up patch adds cmsg UDP_SEGMENT to specify segment size on a per send call basis. Total byte length may then exceed MTU. If not an exact multiple of segment size, the last segment will be shorter. The implementation adds a gso_size field to the udp socket, ip(v6) cmsg cookie and inet_cork structure to be able to set the value at setsockopt or cmsg time and to work with both lockless and corked paths. Initial benchmark numbers show UDP GSO about as expensive as TCP GSO. tcp tso 3197 MB/s 54232 msg/s 54232 calls/s 6,457,754,262 cycles tcp gso 1765 MB/s 29939 msg/s 29939 calls/s 11,203,021,806 cycles tcp without tso/gso * 739 MB/s 12548 msg/s 12548 calls/s 11,205,483,630 cycles udp 876 MB/s 14873 msg/s 624666 calls/s 11,205,777,429 cycles udp gso 2139 MB/s 36282 msg/s 36282 calls/s 11,204,374,561 cycles [*] after reverting commit 0a6b2a1dc2a2 ("tcp: switch to GSO being always on") Measured total system cycles ('-a') for one core while pinning both the network receive path and benchmark process to that core: perf stat -a -C 12 -e cycles \ ./udpgso_bench_tx -C 12 -4 -D "$DST" -l 4 Note the reduction in calls/s with GSO. Bytes per syscall drops increases from 1470 to 61818. Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 6b9d8017b319..bda022c5480b 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -757,7 +757,8 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb, } EXPORT_SYMBOL(udp_set_csum); -static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) +static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, + struct inet_cork *cork) { struct sock *sk = skb->sk; struct inet_sock *inet = inet_sk(sk); @@ -777,6 +778,21 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4) uh->len = htons(len); uh->check = 0; + if (cork->gso_size) { + const int hlen = skb_network_header_len(skb) + + sizeof(struct udphdr); + + if (hlen + cork->gso_size > cork->fragsize) + return -EINVAL; + if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) + return -EINVAL; + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) + return -EIO; + + skb_shinfo(skb)->gso_size = cork->gso_size; + skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + } + if (is_udplite) /* UDP-Lite */ csum = udplite_csum(skb); @@ -828,7 +844,7 @@ int udp_push_pending_frames(struct sock *sk) if (!skb) goto out; - err = udp_send_skb(skb, fl4); + err = udp_send_skb(skb, fl4, &inet->cork.base); out: up->len = 0; @@ -922,6 +938,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.sockc.tsflags = sk->sk_tsflags; ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; + ipc.gso_size = up->gso_size; if (msg->msg_controllen) { err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); @@ -1037,7 +1054,7 @@ back_from_confirm: &cork, msg->msg_flags); err = PTR_ERR(skb); if (!IS_ERR_OR_NULL(skb)) - err = udp_send_skb(skb, fl4); + err = udp_send_skb(skb, fl4, &cork); goto out; } @@ -2367,6 +2384,12 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, up->no_check6_rx = valbool; break; + case UDP_SEGMENT: + if (val < 0 || val > USHRT_MAX) + return -EINVAL; + up->gso_size = val; + break; + /* * UDP-Lite's partial checksum coverage (RFC 3828). */ @@ -2457,6 +2480,10 @@ int udp_lib_getsockopt(struct sock *sk, int level, int optname, val = up->no_check6_rx; break; + case UDP_SEGMENT: + val = up->gso_size; + break; + /* The following two cannot be changed on UDP sockets, the return is * always 0 (which corresponds to the full checksum coverage of UDP). */ case UDPLITE_SEND_CSCOV: -- cgit v1.2.1 From 2e8de8576343ab540856082916bfb84d17288b08 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 26 Apr 2018 13:42:20 -0400 Subject: udp: add gso segment cmsg Allow specifying segment size in the send call. The new control message performs the same function as socket option UDP_SEGMENT while avoiding the extra system call. [ Export udp_cmsg_send for ipv6. -DaveM ] Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 44 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 42 insertions(+), 2 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index bda022c5480b..794aeafeb782 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -853,6 +853,43 @@ out: } EXPORT_SYMBOL(udp_push_pending_frames); +static int __udp_cmsg_send(struct cmsghdr *cmsg, u16 *gso_size) +{ + switch (cmsg->cmsg_type) { + case UDP_SEGMENT: + if (cmsg->cmsg_len != CMSG_LEN(sizeof(__u16))) + return -EINVAL; + *gso_size = *(__u16 *)CMSG_DATA(cmsg); + return 0; + default: + return -EINVAL; + } +} + +int udp_cmsg_send(struct sock *sk, struct msghdr *msg, u16 *gso_size) +{ + struct cmsghdr *cmsg; + bool need_ip = false; + int err; + + for_each_cmsghdr(cmsg, msg) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_UDP) { + need_ip = true; + continue; + } + + err = __udp_cmsg_send(cmsg, gso_size); + if (err) + return err; + } + + return need_ip; +} +EXPORT_SYMBOL_GPL(udp_cmsg_send); + int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); @@ -941,8 +978,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) ipc.gso_size = up->gso_size; if (msg->msg_controllen) { - err = ip_cmsg_send(sk, msg, &ipc, sk->sk_family == AF_INET6); - if (unlikely(err)) { + err = udp_cmsg_send(sk, msg, &ipc.gso_size); + if (err > 0) + err = ip_cmsg_send(sk, msg, &ipc, + sk->sk_family == AF_INET6); + if (unlikely(err < 0)) { kfree(ipc.opt); return err; } -- cgit v1.2.1 From a8c744a8b43733509b5625e65fee1985fbb901d7 Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Mon, 30 Apr 2018 15:58:36 -0400 Subject: udp: disable gso with no_check_tx Syzbot managed to send a udp gso packet without checksum offload into the gso stack by disabling tx checksum (UDP_NO_CHECK6_TX). This triggered the skb_warn_bad_offload. RIP: 0010:skb_warn_bad_offload+0x2bc/0x600 net/core/dev.c:2658 skb_gso_segment include/linux/netdevice.h:4038 [inline] validate_xmit_skb+0x54d/0xd90 net/core/dev.c:3120 __dev_queue_xmit+0xbf8/0x34c0 net/core/dev.c:3577 dev_queue_xmit+0x17/0x20 net/core/dev.c:3618 UDP_NO_CHECK6_TX sets skb->ip_summed to CHECKSUM_NONE just after the udp gso integrity checks in udp_(v6_)send_skb. Extend those checks to catch and fail in this case. After the integrity checks jump directly to the CHECKSUM_PARTIAL case to avoid reading the no_check_tx flags again (a TOCTTOU race). Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 794aeafeb782..dd3102a37ef9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -786,11 +786,14 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, return -EINVAL; if (skb->len > cork->gso_size * UDP_MAX_SEGMENTS) return -EINVAL; + if (sk->sk_no_check_tx) + return -EINVAL; if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) return -EIO; skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + goto csum_partial; } if (is_udplite) /* UDP-Lite */ @@ -802,6 +805,7 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, goto send; } else if (skb->ip_summed == CHECKSUM_PARTIAL) { /* UDP hardware csum */ +csum_partial: udp4_hwcsum(skb, fl4->saddr, fl4->daddr); goto send; -- cgit v1.2.1 From dfec0ee22c0a4488e4f4c764b2720d3096920383 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Mon, 7 May 2018 11:08:22 -0700 Subject: udp: Record gso_segs when supporting UDP segmentation offload We need to record the number of segments that will be generated when this frame is segmented. The expectation is that if gso_size is set then gso_segs is set as well. Without this some drivers such as ixgbe get confused if they attempt to offload this as they record 0 segments for the entire packet instead of the correct value. Reviewed-by: Eric Dumazet Acked-by: Willem de Bruijn Signed-off-by: Alexander Duyck Signed-off-by: David S. Miller --- net/ipv4/udp.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index dd3102a37ef9..e07db83b311e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -793,6 +793,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, skb_shinfo(skb)->gso_size = cork->gso_size; skb_shinfo(skb)->gso_type = SKB_GSO_UDP_L4; + skb_shinfo(skb)->gso_segs = DIV_ROUND_UP(len - sizeof(uh), + cork->gso_size); goto csum_partial; } -- cgit v1.2.1 From 88ab31081b8c8d4d7698711b80fbfd3f52d27fd9 Mon Sep 17 00:00:00 2001 From: Davidlohr Bueso Date: Tue, 8 May 2018 09:07:03 -0700 Subject: net/udp: Update udp_encap_needed static key to modern api No changes in refcount semantics -- key init is false; replace static_key_enable with static_branch_enable static_key_slow_inc|dec with static_branch_inc|dec static_key_false with static_branch_unlikely Added a '_key' suffix to udp and udpv6 encap_needed, for better self documentation. Signed-off-by: Davidlohr Bueso Signed-off-by: David S. Miller --- net/ipv4/udp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index e07db83b311e..229e616fafc5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1875,10 +1875,10 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return 0; } -static struct static_key udp_encap_needed __read_mostly; +static DEFINE_STATIC_KEY_FALSE(udp_encap_needed_key); void udp_encap_enable(void) { - static_key_enable(&udp_encap_needed); + static_branch_enable(&udp_encap_needed_key); } EXPORT_SYMBOL(udp_encap_enable); @@ -1902,7 +1902,7 @@ static int udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) goto drop; nf_reset(skb); - if (static_key_false(&udp_encap_needed) && up->encap_type) { + if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); /* @@ -2365,7 +2365,7 @@ void udp_destroy_sock(struct sock *sk) bool slow = lock_sock_fast(sk); udp_flush_pending_frames(sk); unlock_sock_fast(sk, slow); - if (static_key_false(&udp_encap_needed) && up->encap_type) { + if (static_branch_unlikely(&udp_encap_needed_key) && up->encap_type) { void (*encap_destroy)(struct sock *sk); encap_destroy = READ_ONCE(up->encap_destroy); if (encap_destroy) -- cgit v1.2.1 From ff06342cbca0ec2c0d5da8685d2691e2d0e9764a Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Tue, 22 May 2018 11:34:39 -0400 Subject: udp: exclude gso from xfrm paths UDP GSO delays final datagram construction to the GSO layer. This conflicts with protocol transformations. Fixes: bec1f6f69736 ("udp: generate gso with UDP_SEGMENT") CC: Michal Kubecek Signed-off-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/ipv4/udp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index ff4d4ba67735..d71f1f3e1155 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -788,7 +788,8 @@ static int udp_send_skb(struct sk_buff *skb, struct flowi4 *fl4, return -EINVAL; if (sk->sk_no_check_tx) return -EINVAL; - if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite) + if (skb->ip_summed != CHECKSUM_PARTIAL || is_udplite || + dst_xfrm(skb_dst(skb))) return -EIO; skb_shinfo(skb)->gso_size = cork->gso_size; -- cgit v1.2.1 From 1cedee13d25ab118d325f95588c1a084e9317229 Mon Sep 17 00:00:00 2001 From: Andrey Ignatov Date: Fri, 25 May 2018 08:55:23 -0700 Subject: bpf: Hooks for sys_sendmsg In addition to already existing BPF hooks for sys_bind and sys_connect, the patch provides new hooks for sys_sendmsg. It leverages existing BPF program type `BPF_PROG_TYPE_CGROUP_SOCK_ADDR` that provides access to socket itlself (properties like family, type, protocol) and user-passed `struct sockaddr *` so that BPF program can override destination IP and port for system calls such as sendto(2) or sendmsg(2) and/or assign source IP to the socket. The hooks are implemented as two new attach types: `BPF_CGROUP_UDP4_SENDMSG` and `BPF_CGROUP_UDP6_SENDMSG` for UDPv4 and UDPv6 correspondingly. UDPv4 and UDPv6 separate attach types for same reason as sys_bind and sys_connect hooks, i.e. to prevent reading from / writing to e.g. user_ip6 fields when user passes sockaddr_in since it'd be out-of-bound. The difference with already existing hooks is sys_sendmsg are implemented only for unconnected UDP. For TCP it doesn't make sense to change user-provided `struct sockaddr *` at sendto(2)/sendmsg(2) time since socket either was already connected and has source/destination set or wasn't connected and call to sendto(2)/sendmsg(2) would lead to ENOTCONN anyway. Connected UDP is already handled by sys_connect hooks that can override source/destination at connect time and use fast-path later, i.e. these hooks don't affect UDP fast-path. Rewriting source IP is implemented differently than that in sys_connect hooks. When sys_sendmsg is used with unconnected UDP it doesn't work to just bind socket to desired local IP address since source IP can be set on per-packet basis by using ancillary data (cmsg(3)). So no matter if socket is bound or not, source IP has to be rewritten on every call to sys_sendmsg. To do so two new fields are added to UAPI `struct bpf_sock_addr`; * `msg_src_ip4` to set source IPv4 for UDPv4; * `msg_src_ip6` to set source IPv6 for UDPv6. Signed-off-by: Andrey Ignatov Acked-by: Alexei Starovoitov Acked-by: Martin KaFai Lau Signed-off-by: Daniel Borkmann --- net/ipv4/udp.c | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d71f1f3e1155..3c27d00b5730 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -901,6 +901,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); struct udp_sock *up = udp_sk(sk); + DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); struct flowi4 fl4_stack; struct flowi4 *fl4; int ulen = len; @@ -955,8 +956,7 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) /* * Get and verify the address. */ - if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_in *, usin, msg->msg_name); + if (usin) { if (msg->msg_namelen < sizeof(*usin)) return -EINVAL; if (usin->sin_family != AF_INET) { @@ -1010,6 +1010,22 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) rcu_read_unlock(); } + if (cgroup_bpf_enabled && !connected) { + err = BPF_CGROUP_RUN_PROG_UDP4_SENDMSG_LOCK(sk, + (struct sockaddr *)usin, &ipc.addr); + if (err) + goto out_free; + if (usin) { + if (usin->sin_port == 0) { + /* BPF program set invalid port. Reject it. */ + err = -EINVAL; + goto out_free; + } + daddr = usin->sin_addr.s_addr; + dport = usin->sin_port; + } + } + saddr = ipc.addr; ipc.addr = faddr = daddr; -- cgit v1.2.1 From 6e86000c2c63123e174b7e198735fbb12f0258ea Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 5 Jun 2018 13:40:34 +0200 Subject: netfilter: provide udp*_lib_lookup for nf_tproxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It is now possible to enable the libified nf_tproxy modules without also enabling NETFILTER_XT_TARGET_TPROXY, which throws off the ifdef logic in the udp core code: net/ipv6/netfilter/nf_tproxy_ipv6.o: In function `nf_tproxy_get_sock_v6': nf_tproxy_ipv6.c:(.text+0x1a8): undefined reference to `udp6_lib_lookup' net/ipv4/netfilter/nf_tproxy_ipv4.o: In function `nf_tproxy_get_sock_v4': nf_tproxy_ipv4.c:(.text+0x3d0): undefined reference to `udp4_lib_lookup' We can actually simplify the conditions now to provide the two functions exactly when they are needed. Fixes: 45ca4e0cf273 ("netfilter: Libify xt_TPROXY") Signed-off-by: Arnd Bergmann Acked-by: Paolo Abeni Acked-by: Máté Eckl Signed-off-by: David S. Miller --- net/ipv4/udp.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/ipv4/udp.c') diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index d71f1f3e1155..05af6f81e7e5 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -544,9 +544,7 @@ EXPORT_SYMBOL_GPL(udp4_lib_lookup_skb); /* Must be called under rcu_read_lock(). * Does increment socket refcount. */ -#if IS_ENABLED(CONFIG_NETFILTER_XT_MATCH_SOCKET) || \ - IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TPROXY) || \ - IS_ENABLED(CONFIG_NF_SOCKET_IPV4) +#if IS_ENABLED(CONFIG_NF_TPROXY_IPV4) || IS_ENABLED(CONFIG_NF_SOCKET_IPV4) struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport, __be32 daddr, __be16 dport, int dif) { -- cgit v1.2.1