Diffstat (limited to 'net/core')
-rw-r--r--  net/core/bpf_sk_storage.c    12
-rw-r--r--  net/core/dev.c               20
-rw-r--r--  net/core/devlink.c          398
-rw-r--r--  net/core/dst.c                2
-rw-r--r--  net/core/ethtool.c           24
-rw-r--r--  net/core/filter.c           382
-rw-r--r--  net/core/flow_dissector.c    70
-rw-r--r--  net/core/flow_offload.c     128
-rw-r--r--  net/core/hwbm.c              15
-rw-r--r--  net/core/link_watch.c        13
-rw-r--r--  net/core/neighbour.c          2
-rw-r--r--  net/core/net-traces.c         4
-rw-r--r--  net/core/net_namespace.c     28
-rw-r--r--  net/core/netpoll.c           10
-rw-r--r--  net/core/page_pool.c        103
-rw-r--r--  net/core/pktgen.c             8
-rw-r--r--  net/core/rtnetlink.c          9
-rw-r--r--  net/core/skbuff.c           376
-rw-r--r--  net/core/sock.c               6
-rw-r--r--  net/core/sock_map.c           9
-rw-r--r--  net/core/sock_reuseport.c    24
-rw-r--r--  net/core/xdp.c              123
22 files changed, 1513 insertions, 253 deletions
diff --git a/net/core/bpf_sk_storage.c b/net/core/bpf_sk_storage.c
index d1c4e1f3be2c..94c7f77ecb6b 100644
--- a/net/core/bpf_sk_storage.c
+++ b/net/core/bpf_sk_storage.c
@@ -627,6 +627,7 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
unsigned int i;
u32 nbuckets;
u64 cost;
+ int ret;
smap = kzalloc(sizeof(*smap), GFP_USER | __GFP_NOWARN);
if (!smap)
@@ -636,13 +637,21 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
/* Use at least 2 buckets, select_bucket() is undefined behavior with 1 bucket */
smap->bucket_log = max_t(u32, 1, ilog2(roundup_pow_of_two(num_possible_cpus())));
nbuckets = 1U << smap->bucket_log;
+ cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
+
+ ret = bpf_map_charge_init(&smap->map.memory, cost);
+ if (ret < 0) {
+ kfree(smap);
+ return ERR_PTR(ret);
+ }
+
smap->buckets = kvcalloc(sizeof(*smap->buckets), nbuckets,
GFP_USER | __GFP_NOWARN);
if (!smap->buckets) {
+ bpf_map_charge_finish(&smap->map.memory);
kfree(smap);
return ERR_PTR(-ENOMEM);
}
- cost = sizeof(*smap->buckets) * nbuckets + sizeof(*smap);
for (i = 0; i < nbuckets; i++) {
INIT_HLIST_HEAD(&smap->buckets[i].list);
@@ -652,7 +661,6 @@ static struct bpf_map *bpf_sk_storage_map_alloc(union bpf_attr *attr)
smap->elem_size = sizeof(struct bpf_sk_storage_elem) + attr->value_size;
smap->cache_idx = (unsigned int)atomic_inc_return(&cache_idx) %
BPF_SK_STORAGE_CACHE_SIZE;
- smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
return &smap->map;
}
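The hunk above moves the memlock accounting in front of the bucket allocation, so a failed kvcalloc() can hand the charge back. A minimal sketch of that charge-then-allocate pattern (not part of the patch; my_map / my_map_alloc are hypothetical names, not kernel APIs):

static struct bpf_map *my_map_alloc(union bpf_attr *attr)
{
	struct my_map *m;
	u64 cost;
	int ret;

	m = kzalloc(sizeof(*m), GFP_USER | __GFP_NOWARN);
	if (!m)
		return ERR_PTR(-ENOMEM);

	cost = sizeof(*m) + (u64)attr->max_entries * sizeof(*m->entries);

	/* Charge the rlimit *before* the large allocation ... */
	ret = bpf_map_charge_init(&m->map.memory, cost);
	if (ret < 0) {
		kfree(m);
		return ERR_PTR(ret);
	}

	m->entries = kvcalloc(attr->max_entries, sizeof(*m->entries),
			      GFP_USER | __GFP_NOWARN);
	if (!m->entries) {
		/* ... so every failure path can return it. */
		bpf_map_charge_finish(&m->map.memory);
		kfree(m);
		return ERR_PTR(-ENOMEM);
	}
	return &m->map;
}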
diff --git a/net/core/dev.c b/net/core/dev.c
index d6edd218babd..fc676b2610e3 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -2900,12 +2900,10 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
else
name = netdev_name(dev);
}
- WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
- "gso_type=%d ip_summed=%d\n",
+ skb_dump(KERN_WARNING, skb, false);
+ WARN(1, "%s: caps=(%pNF, %pNF)\n",
name, dev ? &dev->features : &null_features,
- skb->sk ? &skb->sk->sk_route_caps : &null_features,
- skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
- skb_shinfo(skb)->gso_type, skb->ip_summed);
+ skb->sk ? &skb->sk->sk_route_caps : &null_features);
}
/*
@@ -3124,13 +3122,7 @@ void netdev_rx_csum_fault(struct net_device *dev, struct sk_buff *skb)
{
if (net_ratelimit()) {
pr_err("%s: hw csum failure\n", dev ? dev->name : "<unknown>");
- if (dev)
- pr_err("dev features: %pNF\n", &dev->features);
- pr_err("skb len=%u data_len=%u pkt_type=%u gso_size=%u gso_type=%u nr_frags=%u ip_summed=%u csum=%x csum_complete_sw=%d csum_valid=%d csum_level=%u\n",
- skb->len, skb->data_len, skb->pkt_type,
- skb_shinfo(skb)->gso_size, skb_shinfo(skb)->gso_type,
- skb_shinfo(skb)->nr_frags, skb->ip_summed, skb->csum,
- skb->csum_complete_sw, skb->csum_valid, skb->csum_level);
+ skb_dump(KERN_ERR, skb, true);
dump_stack();
}
}
@@ -4689,9 +4681,7 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
__skb_push(skb, skb->mac_len);
skb_do_redirect(skb);
return NULL;
- case TC_ACT_REINSERT:
- /* this does not scrub the packet, and updates stats on error */
- skb_tc_reinsert(skb, &cl_res);
+ case TC_ACT_CONSUMED:
return NULL;
default:
break;
diff --git a/net/core/devlink.c b/net/core/devlink.c
index 132f4b757963..4f40aeace902 100644
--- a/net/core/devlink.c
+++ b/net/core/devlink.c
@@ -17,6 +17,7 @@
#include <linux/netdevice.h>
#include <linux/spinlock.h>
#include <linux/refcount.h>
+#include <linux/workqueue.h>
#include <rdma/ib_verbs.h>
#include <net/netlink.h>
#include <net/genetlink.h>
@@ -514,14 +515,31 @@ static int devlink_nl_port_attrs_put(struct sk_buff *msg,
return 0;
if (nla_put_u16(msg, DEVLINK_ATTR_PORT_FLAVOUR, attrs->flavour))
return -EMSGSIZE;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER, attrs->port_number))
+ if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_PF) {
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_pf.pf))
+ return -EMSGSIZE;
+ } else if (devlink_port->attrs.flavour == DEVLINK_PORT_FLAVOUR_PCI_VF) {
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_PF_NUMBER,
+ attrs->pci_vf.pf) ||
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_PCI_VF_NUMBER,
+ attrs->pci_vf.vf))
+ return -EMSGSIZE;
+ }
+ if (devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_PHYSICAL &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA)
+ return 0;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_NUMBER,
+ attrs->phys.port_number))
return -EMSGSIZE;
if (!attrs->split)
return 0;
- if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP, attrs->port_number))
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ attrs->phys.port_number))
return -EMSGSIZE;
if (nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_SUBPORT_NUMBER,
- attrs->split_subport_number))
+ attrs->phys.split_subport_number))
return -EMSGSIZE;
return 0;
}
@@ -1548,7 +1566,8 @@ static int devlink_nl_eswitch_fill(struct sk_buff *msg, struct devlink *devlink,
u32 seq, int flags)
{
const struct devlink_ops *ops = devlink->ops;
- u8 inline_mode, encap_mode;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
void *hdr;
int err = 0;
u16 mode;
@@ -1624,7 +1643,8 @@ static int devlink_nl_cmd_eswitch_set_doit(struct sk_buff *skb,
{
struct devlink *devlink = info->user_ptr[0];
const struct devlink_ops *ops = devlink->ops;
- u8 inline_mode, encap_mode;
+ enum devlink_eswitch_encap_mode encap_mode;
+ u8 inline_mode;
int err = 0;
u16 mode;
@@ -2668,6 +2688,108 @@ static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info)
return devlink->ops->reload(devlink, info->extack);
}
+static int devlink_nl_flash_update_fill(struct sk_buff *msg,
+ struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done, unsigned long total)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, 0, 0, &devlink_nl_family, 0, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ if (cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS)
+ goto out;
+
+ if (status_msg &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_MSG,
+ status_msg))
+ goto nla_put_failure;
+ if (component &&
+ nla_put_string(msg, DEVLINK_ATTR_FLASH_UPDATE_COMPONENT,
+ component))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_DONE,
+ done, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+ if (nla_put_u64_64bit(msg, DEVLINK_ATTR_FLASH_UPDATE_STATUS_TOTAL,
+ total, DEVLINK_ATTR_PAD))
+ goto nla_put_failure;
+
+out:
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void __devlink_flash_update_notify(struct devlink *devlink,
+ enum devlink_command cmd,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_FLASH_UPDATE &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_END &&
+ cmd != DEVLINK_CMD_FLASH_UPDATE_STATUS);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_flash_update_fill(msg, devlink, cmd, status_msg,
+ component, done, total);
+ if (err)
+ goto out_free_msg;
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+ return;
+
+out_free_msg:
+ nlmsg_free(msg);
+}
+
+void devlink_flash_update_begin_notify(struct devlink *devlink)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE,
+ NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_begin_notify);
+
+void devlink_flash_update_end_notify(struct devlink *devlink)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_END,
+ NULL, NULL, 0, 0);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_end_notify);
+
+void devlink_flash_update_status_notify(struct devlink *devlink,
+ const char *status_msg,
+ const char *component,
+ unsigned long done,
+ unsigned long total)
+{
+ __devlink_flash_update_notify(devlink,
+ DEVLINK_CMD_FLASH_UPDATE_STATUS,
+ status_msg, component, done, total);
+}
+EXPORT_SYMBOL_GPL(devlink_flash_update_status_notify);
+
static int devlink_nl_cmd_flash_update(struct sk_buff *skb,
struct genl_info *info)
{
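The three helpers exported above let a driver emit flash-progress notifications on the devlink config multicast group. A rough sketch of a driver ->flash_update() callback using them (my_dev_* and the chunked loop are hypothetical; the callback signature is assumed to match this kernel generation):

static int my_devlink_flash_update(struct devlink *devlink,
				   const char *file_name, const char *component,
				   struct netlink_ext_ack *extack)
{
	struct my_dev *mdev = devlink_priv(devlink);
	unsigned long done, total;
	int err;

	devlink_flash_update_begin_notify(devlink);
	devlink_flash_update_status_notify(devlink, "Preparing image",
					   component, 0, 0);

	err = my_dev_flash_prepare(mdev, file_name, &total);
	for (done = 0; !err && done < total; done += MY_FLASH_CHUNK) {
		err = my_dev_flash_chunk(mdev, done, MY_FLASH_CHUNK);
		devlink_flash_update_status_notify(devlink, "Flashing",
						   component, done, total);
	}

	devlink_flash_update_end_notify(devlink);
	return err;
}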
@@ -4415,6 +4537,35 @@ nla_put_failure:
return err;
}
+static int devlink_fmsg_dumpit(struct devlink_fmsg *fmsg, struct sk_buff *skb,
+ struct netlink_callback *cb,
+ enum devlink_command cmd)
+{
+ int index = cb->args[0];
+ int tmp_index = index;
+ void *hdr;
+ int err;
+
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq,
+ &devlink_nl_family, NLM_F_ACK | NLM_F_MULTI, cmd);
+ if (!hdr) {
+ err = -EMSGSIZE;
+ goto nla_put_failure;
+ }
+
+ err = devlink_fmsg_prepare_skb(fmsg, skb, &index);
+ if ((err && err != -EMSGSIZE) || tmp_index == index)
+ goto nla_put_failure;
+
+ cb->args[0] = index;
+ genlmsg_end(skb, hdr);
+ return skb->len;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return err;
+}
+
struct devlink_health_reporter {
struct list_head list;
void *priv;
@@ -4647,17 +4798,16 @@ int devlink_health_report(struct devlink_health_reporter *reporter,
EXPORT_SYMBOL_GPL(devlink_health_report);
static struct devlink_health_reporter *
-devlink_health_reporter_get_from_info(struct devlink *devlink,
- struct genl_info *info)
+devlink_health_reporter_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
{
struct devlink_health_reporter *reporter;
char *reporter_name;
- if (!info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
+ if (!attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME])
return NULL;
- reporter_name =
- nla_data(info->attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
+ reporter_name = nla_data(attrs[DEVLINK_ATTR_HEALTH_REPORTER_NAME]);
mutex_lock(&devlink->reporters_lock);
reporter = devlink_health_reporter_find_by_name(devlink, reporter_name);
if (reporter)
@@ -4666,6 +4816,48 @@ devlink_health_reporter_get_from_info(struct devlink *devlink,
return reporter;
}
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_health_reporter_get_from_attrs(devlink, info->attrs);
+}
+
+static struct devlink_health_reporter *
+devlink_health_reporter_get_from_cb(struct netlink_callback *cb)
+{
+ struct devlink_health_reporter *reporter;
+ struct devlink *devlink;
+ struct nlattr **attrs;
+ int err;
+
+ attrs = kmalloc_array(DEVLINK_ATTR_MAX + 1, sizeof(*attrs), GFP_KERNEL);
+ if (!attrs)
+ return NULL;
+
+ err = nlmsg_parse_deprecated(cb->nlh,
+ GENL_HDRLEN + devlink_nl_family.hdrsize,
+ attrs, DEVLINK_ATTR_MAX,
+ devlink_nl_family.policy, cb->extack);
+ if (err)
+ goto free;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_attrs(sock_net(cb->skb->sk), attrs);
+ if (IS_ERR(devlink))
+ goto unlock;
+
+ reporter = devlink_health_reporter_get_from_attrs(devlink, attrs);
+ mutex_unlock(&devlink_mutex);
+ kfree(attrs);
+ return reporter;
+unlock:
+ mutex_unlock(&devlink_mutex);
+free:
+ kfree(attrs);
+ return NULL;
+}
+
static void
devlink_health_reporter_put(struct devlink_health_reporter *reporter)
{
@@ -4901,32 +5093,40 @@ out:
return err;
}
-static int devlink_nl_cmd_health_reporter_dump_get_doit(struct sk_buff *skb,
- struct genl_info *info)
+static int
+devlink_nl_cmd_health_reporter_dump_get_dumpit(struct sk_buff *skb,
+ struct netlink_callback *cb)
{
- struct devlink *devlink = info->user_ptr[0];
struct devlink_health_reporter *reporter;
+ u64 start = cb->args[0];
int err;
- reporter = devlink_health_reporter_get_from_info(devlink, info);
+ reporter = devlink_health_reporter_get_from_cb(cb);
if (!reporter)
return -EINVAL;
if (!reporter->ops->dump) {
- devlink_health_reporter_put(reporter);
- return -EOPNOTSUPP;
+ err = -EOPNOTSUPP;
+ goto out;
}
-
mutex_lock(&reporter->dump_lock);
- err = devlink_health_do_dump(reporter, NULL);
- if (err)
- goto out;
-
- err = devlink_fmsg_snd(reporter->dump_fmsg, info,
- DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET, 0);
+ if (!start) {
+ err = devlink_health_do_dump(reporter, NULL);
+ if (err)
+ goto unlock;
+ cb->args[1] = reporter->dump_ts;
+ }
+ if (!reporter->dump_fmsg || cb->args[1] != reporter->dump_ts) {
+ NL_SET_ERR_MSG_MOD(cb->extack, "Dump trampled, please retry");
+ err = -EAGAIN;
+ goto unlock;
+ }
-out:
+ err = devlink_fmsg_dumpit(reporter->dump_fmsg, skb, cb,
+ DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET);
+unlock:
mutex_unlock(&reporter->dump_lock);
+out:
devlink_health_reporter_put(reporter);
return err;
}
@@ -5263,7 +5463,7 @@ static const struct genl_ops devlink_nl_ops[] = {
{
.cmd = DEVLINK_CMD_HEALTH_REPORTER_DUMP_GET,
.validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP,
- .doit = devlink_nl_cmd_health_reporter_dump_get_doit,
+ .dumpit = devlink_nl_cmd_health_reporter_dump_get_dumpit,
.flags = GENL_ADMIN_PERM,
.internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK |
DEVLINK_NL_FLAG_NO_LOCK,
@@ -5386,6 +5586,38 @@ void devlink_free(struct devlink *devlink)
}
EXPORT_SYMBOL_GPL(devlink_free);
+static void devlink_port_type_warn(struct work_struct *work)
+{
+ WARN(true, "Type was not set for devlink port.");
+}
+
+static bool devlink_port_type_should_warn(struct devlink_port *devlink_port)
+{
+ /* Ignore CPU and DSA flavours. */
+ return devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_CPU &&
+ devlink_port->attrs.flavour != DEVLINK_PORT_FLAVOUR_DSA;
+}
+
+#define DEVLINK_PORT_TYPE_WARN_TIMEOUT (HZ * 30)
+
+static void devlink_port_type_warn_schedule(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ /* Schedule a work to WARN in case driver does not set port
+ * type within timeout.
+ */
+ schedule_delayed_work(&devlink_port->type_warn_dw,
+ DEVLINK_PORT_TYPE_WARN_TIMEOUT);
+}
+
+static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port)
+{
+ if (!devlink_port_type_should_warn(devlink_port))
+ return;
+ cancel_delayed_work_sync(&devlink_port->type_warn_dw);
+}
+
/**
* devlink_port_register - Register devlink port
*
@@ -5415,6 +5647,8 @@ int devlink_port_register(struct devlink *devlink,
list_add_tail(&devlink_port->list, &devlink->port_list);
INIT_LIST_HEAD(&devlink_port->param_list);
mutex_unlock(&devlink->lock);
+ INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn);
+ devlink_port_type_warn_schedule(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
return 0;
}
@@ -5429,6 +5663,7 @@ void devlink_port_unregister(struct devlink_port *devlink_port)
{
struct devlink *devlink = devlink_port->devlink;
+ devlink_port_type_warn_cancel(devlink_port);
devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
mutex_lock(&devlink->lock);
list_del(&devlink_port->list);
@@ -5442,6 +5677,7 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port,
{
if (WARN_ON(!devlink_port->registered))
return;
+ devlink_port_type_warn_cancel(devlink_port);
spin_lock(&devlink_port->type_lock);
devlink_port->type = type;
devlink_port->type_dev = type_dev;
@@ -5515,9 +5751,33 @@ EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
void devlink_port_type_clear(struct devlink_port *devlink_port)
{
__devlink_port_type_set(devlink_port, DEVLINK_PORT_TYPE_NOTSET, NULL);
+ devlink_port_type_warn_schedule(devlink_port);
}
EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+static int __devlink_port_attrs_set(struct devlink_port *devlink_port,
+ enum devlink_port_flavour flavour,
+ const unsigned char *switch_id,
+ unsigned char switch_id_len)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+
+ if (WARN_ON(devlink_port->registered))
+ return -EEXIST;
+ attrs->set = true;
+ attrs->flavour = flavour;
+ if (switch_id) {
+ attrs->switch_port = true;
+ if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
+ switch_id_len = MAX_PHYS_ITEM_ID_LEN;
+ memcpy(attrs->switch_id.id, switch_id, switch_id_len);
+ attrs->switch_id.id_len = switch_id_len;
+ } else {
+ attrs->switch_port = false;
+ }
+ return 0;
+}
+
/**
* devlink_port_attrs_set - Set port attributes
*
@@ -5540,26 +5800,72 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port,
unsigned char switch_id_len)
{
struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
- if (WARN_ON(devlink_port->registered))
+ ret = __devlink_port_attrs_set(devlink_port, flavour,
+ switch_id, switch_id_len);
+ if (ret)
return;
- attrs->set = true;
- attrs->flavour = flavour;
- attrs->port_number = port_number;
attrs->split = split;
- attrs->split_subport_number = split_subport_number;
- if (switch_id) {
- attrs->switch_port = true;
- if (WARN_ON(switch_id_len > MAX_PHYS_ITEM_ID_LEN))
- switch_id_len = MAX_PHYS_ITEM_ID_LEN;
- memcpy(attrs->switch_id.id, switch_id, switch_id_len);
- attrs->switch_id.id_len = switch_id_len;
- } else {
- attrs->switch_port = false;
- }
+ attrs->phys.port_number = port_number;
+ attrs->phys.split_subport_number = split_subport_number;
}
EXPORT_SYMBOL_GPL(devlink_port_attrs_set);
+/**
+ * devlink_port_attrs_pci_pf_set - Set PCI PF port attributes
+ *
+ * @devlink_port: devlink port
+ * @pf: associated PF for the devlink port instance
+ * @switch_id: if the port is part of switch, this is buffer with ID,
+ * otherwise this is NULL
+ * @switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port,
+ const unsigned char *switch_id,
+ unsigned char switch_id_len, u16 pf)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
+
+ ret = __devlink_port_attrs_set(devlink_port,
+ DEVLINK_PORT_FLAVOUR_PCI_PF,
+ switch_id, switch_id_len);
+ if (ret)
+ return;
+
+ attrs->pci_pf.pf = pf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_pf_set);
+
+/**
+ * devlink_port_attrs_pci_vf_set - Set PCI VF port attributes
+ *
+ * @devlink_port: devlink port
+ * @pf: associated PF for the devlink port instance
+ * @vf: associated VF of a PF for the devlink port instance
+ * @switch_id: if the port is part of switch, this is buffer with ID,
+ * otherwise this is NULL
+ * @switch_id_len: length of the switch_id buffer
+ */
+void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port,
+ const unsigned char *switch_id,
+ unsigned char switch_id_len,
+ u16 pf, u16 vf)
+{
+ struct devlink_port_attrs *attrs = &devlink_port->attrs;
+ int ret;
+
+ ret = __devlink_port_attrs_set(devlink_port,
+ DEVLINK_PORT_FLAVOUR_PCI_VF,
+ switch_id, switch_id_len);
+ if (ret)
+ return;
+ attrs->pci_vf.pf = pf;
+ attrs->pci_vf.vf = vf;
+}
+EXPORT_SYMBOL_GPL(devlink_port_attrs_pci_vf_set);
+
static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
char *name, size_t len)
{
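With the attribute helpers above, a driver can describe PCI PF/VF ports before registering them; the core then derives phys_port_name as "pf%u" / "pf%uvf%u" (see the next hunk). A minimal sketch, with mdev and its fields hypothetical:

static int my_register_vf_port(struct my_dev *mdev, u16 pf, u16 vf,
			       struct devlink_port *dl_port)
{
	/* Attributes must be set before devlink_port_register();
	 * __devlink_port_attrs_set() WARNs on an already-registered port.
	 */
	devlink_port_attrs_pci_vf_set(dl_port, mdev->switch_id,
				      mdev->switch_id_len, pf, vf);

	return devlink_port_register(mdev->devlink, dl_port,
				     mdev->vf_base_index + vf);
}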
@@ -5572,10 +5878,11 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
switch (attrs->flavour) {
case DEVLINK_PORT_FLAVOUR_PHYSICAL:
if (!attrs->split)
- n = snprintf(name, len, "p%u", attrs->port_number);
+ n = snprintf(name, len, "p%u", attrs->phys.port_number);
else
- n = snprintf(name, len, "p%us%u", attrs->port_number,
- attrs->split_subport_number);
+ n = snprintf(name, len, "p%us%u",
+ attrs->phys.port_number,
+ attrs->phys.split_subport_number);
break;
case DEVLINK_PORT_FLAVOUR_CPU:
case DEVLINK_PORT_FLAVOUR_DSA:
@@ -5584,6 +5891,13 @@ static int __devlink_port_phys_port_name_get(struct devlink_port *devlink_port,
*/
WARN_ON(1);
return -EINVAL;
+ case DEVLINK_PORT_FLAVOUR_PCI_PF:
+ n = snprintf(name, len, "pf%u", attrs->pci_pf.pf);
+ break;
+ case DEVLINK_PORT_FLAVOUR_PCI_VF:
+ n = snprintf(name, len, "pf%uvf%u",
+ attrs->pci_vf.pf, attrs->pci_vf.vf);
+ break;
}
if (n >= len)
diff --git a/net/core/dst.c b/net/core/dst.c
index e46366228eaf..1325316d9eab 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -160,7 +160,7 @@ void dst_dev_put(struct dst_entry *dst)
dst->ops->ifdown(dst, dev, true);
dst->input = dst_discard;
dst->output = dst_discard_out;
- dst->dev = dev_net(dst->dev)->loopback_dev;
+ dst->dev = blackhole_netdev;
dev_hold(dst->dev);
dev_put(dev);
}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 4d1011b2e24f..6288e69e94fc 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -2883,6 +2883,30 @@ ethtool_rx_flow_rule_create(const struct ethtool_rx_flow_spec_input *input)
match->mask.basic.n_proto = htons(0xffff);
switch (fs->flow_type & ~(FLOW_EXT | FLOW_MAC_EXT | FLOW_RSS)) {
+ case ETHER_FLOW: {
+ const struct ethhdr *ether_spec, *ether_m_spec;
+
+ ether_spec = &fs->h_u.ether_spec;
+ ether_m_spec = &fs->m_u.ether_spec;
+
+ if (!is_zero_ether_addr(ether_m_spec->h_source)) {
+ ether_addr_copy(match->key.eth_addrs.src,
+ ether_spec->h_source);
+ ether_addr_copy(match->mask.eth_addrs.src,
+ ether_m_spec->h_source);
+ }
+ if (!is_zero_ether_addr(ether_m_spec->h_dest)) {
+ ether_addr_copy(match->key.eth_addrs.dst,
+ ether_spec->h_dest);
+ ether_addr_copy(match->mask.eth_addrs.dst,
+ ether_m_spec->h_dest);
+ }
+ if (ether_m_spec->h_proto) {
+ match->key.basic.n_proto = ether_spec->h_proto;
+ match->mask.basic.n_proto = ether_m_spec->h_proto;
+ }
+ }
+ break;
case TCP_V4_FLOW:
case UDP_V4_FLOW: {
const struct ethtool_tcpip4_spec *v4_spec, *v4_m_spec;
diff --git a/net/core/filter.c b/net/core/filter.c
index f615e42cf4ef..47f6386fb17a 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -62,6 +62,7 @@
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/ip_fib.h>
+#include <net/nexthop.h>
#include <net/flow.h>
#include <net/arp.h>
#include <net/ipv6.h>
@@ -2157,8 +2158,8 @@ BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
if (unlikely(flags & ~(BPF_F_INGRESS)))
return TC_ACT_SHOT;
- ri->ifindex = ifindex;
ri->flags = flags;
+ ri->tgt_index = ifindex;
return TC_ACT_REDIRECT;
}
@@ -2168,8 +2169,8 @@ int skb_do_redirect(struct sk_buff *skb)
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct net_device *dev;
- dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->ifindex);
- ri->ifindex = 0;
+ dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+ ri->tgt_index = 0;
if (unlikely(!dev)) {
kfree_skb(skb);
return -EINVAL;
@@ -3487,11 +3488,11 @@ xdp_do_redirect_slow(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_redirect_info *ri)
{
struct net_device *fwd;
- u32 index = ri->ifindex;
+ u32 index = ri->tgt_index;
int err;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
- ri->ifindex = 0;
+ ri->tgt_index = 0;
if (unlikely(!fwd)) {
err = -EINVAL;
goto err;
@@ -3522,7 +3523,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
err = dev_map_enqueue(dst, xdp, dev_rx);
if (unlikely(err))
return err;
- __dev_map_insert_ctx(map, index);
break;
}
case BPF_MAP_TYPE_CPUMAP: {
@@ -3531,7 +3531,6 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
err = cpu_map_enqueue(rcpu, xdp, dev_rx);
if (unlikely(err))
return err;
- __cpu_map_insert_ctx(map, index);
break;
}
case BPF_MAP_TYPE_XSKMAP: {
@@ -3605,18 +3604,14 @@ static int xdp_do_redirect_map(struct net_device *dev, struct xdp_buff *xdp,
struct bpf_prog *xdp_prog, struct bpf_map *map,
struct bpf_redirect_info *ri)
{
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ u32 index = ri->tgt_index;
+ void *fwd = ri->tgt_value;
int err;
- ri->ifindex = 0;
+ ri->tgt_index = 0;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
if (ri->map_to_flush && unlikely(ri->map_to_flush != map))
xdp_do_flush_map();
@@ -3652,19 +3647,14 @@ static int xdp_do_generic_redirect_map(struct net_device *dev,
struct bpf_map *map)
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- u32 index = ri->ifindex;
- void *fwd = NULL;
+ u32 index = ri->tgt_index;
+ void *fwd = ri->tgt_value;
int err = 0;
- ri->ifindex = 0;
+ ri->tgt_index = 0;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
- fwd = __xdp_map_lookup_elem(map, index);
- if (unlikely(!fwd)) {
- err = -EINVAL;
- goto err;
- }
-
if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
struct bpf_dtab_netdev *dst = fwd;
@@ -3696,14 +3686,14 @@ int xdp_do_generic_redirect(struct net_device *dev, struct sk_buff *skb,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
struct bpf_map *map = READ_ONCE(ri->map);
- u32 index = ri->ifindex;
+ u32 index = ri->tgt_index;
struct net_device *fwd;
int err = 0;
if (map)
return xdp_do_generic_redirect_map(dev, skb, xdp, xdp_prog,
map);
- ri->ifindex = 0;
+ ri->tgt_index = 0;
fwd = dev_get_by_index_rcu(dev_net(dev), index);
if (unlikely(!fwd)) {
err = -EINVAL;
@@ -3731,8 +3721,9 @@ BPF_CALL_2(bpf_xdp_redirect, u32, ifindex, u64, flags)
if (unlikely(flags))
return XDP_ABORTED;
- ri->ifindex = ifindex;
ri->flags = flags;
+ ri->tgt_index = ifindex;
+ ri->tgt_value = NULL;
WRITE_ONCE(ri->map, NULL);
return XDP_REDIRECT;
@@ -3751,11 +3742,23 @@ BPF_CALL_3(bpf_xdp_redirect_map, struct bpf_map *, map, u32, ifindex,
{
struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
- if (unlikely(flags))
+ /* Lower bits of the flags are used as return code on lookup failure */
+ if (unlikely(flags > XDP_TX))
return XDP_ABORTED;
- ri->ifindex = ifindex;
+ ri->tgt_value = __xdp_map_lookup_elem(map, ifindex);
+ if (unlikely(!ri->tgt_value)) {
+ /* If the lookup fails we want to clear out the state in the
+ * redirect_info struct completely, so that if an eBPF program
+ * performs multiple lookups, the last one always takes
+ * precedence.
+ */
+ WRITE_ONCE(ri->map, NULL);
+ return flags;
+ }
+
ri->flags = flags;
+ ri->tgt_index = ifindex;
WRITE_ONCE(ri->map, map);
return XDP_REDIRECT;
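With the map lookup moved into the helper, the lower bits of the flags argument now double as the return code when the map entry is missing. A sketch of an XDP program relying on that (BTF map definition style and header paths vary with the libbpf version in use):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

struct {
	__uint(type, BPF_MAP_TYPE_DEVMAP);
	__uint(max_entries, 64);
	__type(key, __u32);
	__type(value, __u32);
} tx_port SEC(".maps");

SEC("xdp")
int xdp_redirect_or_pass(struct xdp_md *ctx)
{
	__u32 idx = ctx->rx_queue_index;

	/* If 'idx' has no devmap entry, the helper now returns the lower
	 * flag bits (XDP_PASS here) instead of failing in xdp_do_redirect().
	 */
	return bpf_redirect_map(&tx_port, idx, XDP_PASS);
}

char _license[] SEC("license") = "GPL";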
@@ -4670,7 +4673,7 @@ static int bpf_ipv4_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
if (res.type != RTN_UNICAST)
return BPF_FIB_LKUP_RET_NOT_FWDED;
- if (res.fi->fib_nhs > 1)
+ if (fib_info_num_path(res.fi) > 1)
fib_select_path(net, &res, &fl4, NULL);
if (check_mtu) {
@@ -4737,7 +4740,7 @@ static int bpf_ipv6_fib_lookup(struct net *net, struct bpf_fib_lookup *params,
return -ENODEV;
idev = __in6_dev_get_safely(dev);
- if (unlikely(!idev || !net->ipv6.devconf_all->forwarding))
+ if (unlikely(!idev || !idev->cnf.forwarding))
return BPF_FIB_LKUP_RET_FWD_DISABLED;
if (flags & BPF_FIB_LOOKUP_OUTPUT) {
@@ -5191,54 +5194,6 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = {
};
#endif /* CONFIG_IPV6_SEG6_BPF */
-#define CONVERT_COMMON_TCP_SOCK_FIELDS(md_type, CONVERT) \
-do { \
- switch (si->off) { \
- case offsetof(md_type, snd_cwnd): \
- CONVERT(snd_cwnd); break; \
- case offsetof(md_type, srtt_us): \
- CONVERT(srtt_us); break; \
- case offsetof(md_type, snd_ssthresh): \
- CONVERT(snd_ssthresh); break; \
- case offsetof(md_type, rcv_nxt): \
- CONVERT(rcv_nxt); break; \
- case offsetof(md_type, snd_nxt): \
- CONVERT(snd_nxt); break; \
- case offsetof(md_type, snd_una): \
- CONVERT(snd_una); break; \
- case offsetof(md_type, mss_cache): \
- CONVERT(mss_cache); break; \
- case offsetof(md_type, ecn_flags): \
- CONVERT(ecn_flags); break; \
- case offsetof(md_type, rate_delivered): \
- CONVERT(rate_delivered); break; \
- case offsetof(md_type, rate_interval_us): \
- CONVERT(rate_interval_us); break; \
- case offsetof(md_type, packets_out): \
- CONVERT(packets_out); break; \
- case offsetof(md_type, retrans_out): \
- CONVERT(retrans_out); break; \
- case offsetof(md_type, total_retrans): \
- CONVERT(total_retrans); break; \
- case offsetof(md_type, segs_in): \
- CONVERT(segs_in); break; \
- case offsetof(md_type, data_segs_in): \
- CONVERT(data_segs_in); break; \
- case offsetof(md_type, segs_out): \
- CONVERT(segs_out); break; \
- case offsetof(md_type, data_segs_out): \
- CONVERT(data_segs_out); break; \
- case offsetof(md_type, lost_out): \
- CONVERT(lost_out); break; \
- case offsetof(md_type, sacked_out): \
- CONVERT(sacked_out); break; \
- case offsetof(md_type, bytes_received): \
- CONVERT(bytes_received); break; \
- case offsetof(md_type, bytes_acked): \
- CONVERT(bytes_acked); break; \
- } \
-} while (0)
-
#ifdef CONFIG_INET
static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple,
int dif, int sdif, u8 family, u8 proto)
@@ -5589,7 +5544,8 @@ static const struct bpf_func_proto bpf_sock_addr_sk_lookup_udp_proto = {
bool bpf_tcp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
struct bpf_insn_access_aux *info)
{
- if (off < 0 || off >= offsetofend(struct bpf_tcp_sock, bytes_acked))
+ if (off < 0 || off >= offsetofend(struct bpf_tcp_sock,
+ icsk_retransmits))
return false;
if (off % size != 0)
@@ -5620,8 +5576,19 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, FIELD)); \
} while (0)
- CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_tcp_sock,
- BPF_TCP_SOCK_GET_COMMON);
+#define BPF_INET_SOCK_GET_COMMON(FIELD) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct inet_connection_sock, \
+ FIELD) > \
+ FIELD_SIZEOF(struct bpf_tcp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \
+ struct inet_connection_sock, \
+ FIELD), \
+ si->dst_reg, si->src_reg, \
+ offsetof( \
+ struct inet_connection_sock, \
+ FIELD)); \
+ } while (0)
if (insn > insn_buf)
return insn - insn_buf;
@@ -5637,6 +5604,81 @@ u32 bpf_tcp_sock_convert_ctx_access(enum bpf_access_type type,
offsetof(struct tcp_sock, rtt_min) +
offsetof(struct minmax_sample, v));
break;
+ case offsetof(struct bpf_tcp_sock, snd_cwnd):
+ BPF_TCP_SOCK_GET_COMMON(snd_cwnd);
+ break;
+ case offsetof(struct bpf_tcp_sock, srtt_us):
+ BPF_TCP_SOCK_GET_COMMON(srtt_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_ssthresh):
+ BPF_TCP_SOCK_GET_COMMON(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_tcp_sock, rcv_nxt):
+ BPF_TCP_SOCK_GET_COMMON(rcv_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_nxt):
+ BPF_TCP_SOCK_GET_COMMON(snd_nxt);
+ break;
+ case offsetof(struct bpf_tcp_sock, snd_una):
+ BPF_TCP_SOCK_GET_COMMON(snd_una);
+ break;
+ case offsetof(struct bpf_tcp_sock, mss_cache):
+ BPF_TCP_SOCK_GET_COMMON(mss_cache);
+ break;
+ case offsetof(struct bpf_tcp_sock, ecn_flags):
+ BPF_TCP_SOCK_GET_COMMON(ecn_flags);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_delivered):
+ BPF_TCP_SOCK_GET_COMMON(rate_delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, rate_interval_us):
+ BPF_TCP_SOCK_GET_COMMON(rate_interval_us);
+ break;
+ case offsetof(struct bpf_tcp_sock, packets_out):
+ BPF_TCP_SOCK_GET_COMMON(packets_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, retrans_out):
+ BPF_TCP_SOCK_GET_COMMON(retrans_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, total_retrans):
+ BPF_TCP_SOCK_GET_COMMON(total_retrans);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_in):
+ BPF_TCP_SOCK_GET_COMMON(segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_in):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_in);
+ break;
+ case offsetof(struct bpf_tcp_sock, segs_out):
+ BPF_TCP_SOCK_GET_COMMON(segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, data_segs_out):
+ BPF_TCP_SOCK_GET_COMMON(data_segs_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, lost_out):
+ BPF_TCP_SOCK_GET_COMMON(lost_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, sacked_out):
+ BPF_TCP_SOCK_GET_COMMON(sacked_out);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_received):
+ BPF_TCP_SOCK_GET_COMMON(bytes_received);
+ break;
+ case offsetof(struct bpf_tcp_sock, bytes_acked):
+ BPF_TCP_SOCK_GET_COMMON(bytes_acked);
+ break;
+ case offsetof(struct bpf_tcp_sock, dsack_dups):
+ BPF_TCP_SOCK_GET_COMMON(dsack_dups);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered):
+ BPF_TCP_SOCK_GET_COMMON(delivered);
+ break;
+ case offsetof(struct bpf_tcp_sock, delivered_ce):
+ BPF_TCP_SOCK_GET_COMMON(delivered_ce);
+ break;
+ case offsetof(struct bpf_tcp_sock, icsk_retransmits):
+ BPF_INET_SOCK_GET_COMMON(icsk_retransmits);
+ break;
}
return insn - insn_buf;
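The open-coded cases replace the old CONVERT_COMMON_TCP_SOCK_FIELDS macro and add dsack_dups, delivered, delivered_ce and icsk_retransmits (the last one read out of inet_connection_sock). A sketch of a BPF program reading the new fields through bpf_tcp_sock() (program type and section name are illustrative):

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("cgroup_skb/egress")
int watch_retransmits(struct __sk_buff *skb)
{
	struct bpf_sock *sk = skb->sk;
	struct bpf_tcp_sock *tp;

	if (!sk)
		return 1;
	sk = bpf_sk_fullsock(sk);
	if (!sk)
		return 1;
	tp = bpf_tcp_sock(sk);
	if (!tp)
		return 1;

	if (tp->icsk_retransmits || tp->delivered_ce)
		bpf_printk("retrans=%u delivered_ce=%u\n",
			   tp->icsk_retransmits, tp->delivered_ce);
	return 1;
}

char _license[] SEC("license") = "GPL";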
@@ -5650,7 +5692,7 @@ BPF_CALL_1(bpf_tcp_sock, struct sock *, sk)
return (unsigned long)NULL;
}
-static const struct bpf_func_proto bpf_tcp_sock_proto = {
+const struct bpf_func_proto bpf_tcp_sock_proto = {
.func = bpf_tcp_sock,
.gpl_only = false,
.ret_type = RET_PTR_TO_TCP_SOCK_OR_NULL,
@@ -5694,6 +5736,46 @@ BPF_CALL_1(bpf_skb_ecn_set_ce, struct sk_buff *, skb)
return INET_ECN_set_ce(skb);
}
+bool bpf_xdp_sock_is_valid_access(int off, int size, enum bpf_access_type type,
+ struct bpf_insn_access_aux *info)
+{
+ if (off < 0 || off >= offsetofend(struct bpf_xdp_sock, queue_id))
+ return false;
+
+ if (off % size != 0)
+ return false;
+
+ switch (off) {
+ default:
+ return size == sizeof(__u32);
+ }
+}
+
+u32 bpf_xdp_sock_convert_ctx_access(enum bpf_access_type type,
+ const struct bpf_insn *si,
+ struct bpf_insn *insn_buf,
+ struct bpf_prog *prog, u32 *target_size)
+{
+ struct bpf_insn *insn = insn_buf;
+
+#define BPF_XDP_SOCK_GET(FIELD) \
+ do { \
+ BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_sock, FIELD) > \
+ FIELD_SIZEOF(struct bpf_xdp_sock, FIELD)); \
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_sock, FIELD),\
+ si->dst_reg, si->src_reg, \
+ offsetof(struct xdp_sock, FIELD)); \
+ } while (0)
+
+ switch (si->off) {
+ case offsetof(struct bpf_xdp_sock, queue_id):
+ BPF_XDP_SOCK_GET(queue_id);
+ break;
+ }
+
+ return insn - insn_buf;
+}
+
static const struct bpf_func_proto bpf_skb_ecn_set_ce_proto = {
.func = bpf_skb_ecn_set_ce,
.gpl_only = false,
@@ -5896,6 +5978,10 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
case BPF_FUNC_skc_lookup_tcp:
return &bpf_sock_addr_skc_lookup_tcp_proto;
#endif /* CONFIG_INET */
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
default:
return bpf_base_func_proto(func_id);
}
@@ -5933,6 +6019,10 @@ cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_sk_storage_get_proto;
case BPF_FUNC_sk_storage_delete:
return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_SOCK_CGROUP_DATA
+ case BPF_FUNC_skb_cgroup_id:
+ return &bpf_skb_cgroup_id_proto;
+#endif
#ifdef CONFIG_INET
case BPF_FUNC_tcp_sock:
return &bpf_tcp_sock_proto;
@@ -6113,6 +6203,14 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
return &bpf_get_local_storage_proto;
case BPF_FUNC_perf_event_output:
return &bpf_sockopt_event_output_proto;
+ case BPF_FUNC_sk_storage_get:
+ return &bpf_sk_storage_get_proto;
+ case BPF_FUNC_sk_storage_delete:
+ return &bpf_sk_storage_delete_proto;
+#ifdef CONFIG_INET
+ case BPF_FUNC_tcp_sock:
+ return &bpf_tcp_sock_proto;
+#endif /* CONFIG_INET */
default:
return bpf_base_func_proto(func_id);
}
@@ -6792,6 +6890,16 @@ static bool sock_addr_is_valid_access(int off, int size,
if (!bpf_ctx_narrow_access_ok(off, size, size_default))
return false;
} else {
+ if (bpf_ctx_wide_store_ok(off, size,
+ struct bpf_sock_addr,
+ user_ip6))
+ return true;
+
+ if (bpf_ctx_wide_store_ok(off, size,
+ struct bpf_sock_addr,
+ msg_src_ip6))
+ return true;
+
if (size != size_default)
return false;
}
@@ -6800,6 +6908,13 @@ static bool sock_addr_is_valid_access(int off, int size,
if (size != size_default)
return false;
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ if (type != BPF_READ)
+ return false;
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET;
+ break;
default:
if (type == BPF_READ) {
if (size != size_default)
@@ -6843,6 +6958,11 @@ static bool sock_ops_is_valid_access(int off, int size,
if (size != sizeof(__u64))
return false;
break;
+ case offsetof(struct bpf_sock_ops, sk):
+ if (size != sizeof(__u64))
+ return false;
+ info->reg_type = PTR_TO_SOCKET_OR_NULL;
+ break;
default:
if (size != size_default)
return false;
@@ -7620,9 +7740,6 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
/* SOCK_ADDR_STORE_NESTED_FIELD_OFF() has semantic similar to
* SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF() but for store operation.
*
- * It doesn't support SIZE argument though since narrow stores are not
- * supported for now.
- *
* In addition it uses Temporary Field TF (member of struct S) as the 3rd
* "register" since two registers available in convert_ctx_access are not
* enough: we can't override neither SRC, since it contains value to store, nor
@@ -7630,7 +7747,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
* instructions. But we need a temporary place to save pointer to nested
* structure whose field we want to store to.
*/
-#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, TF) \
+#define SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, OFF, TF) \
do { \
int tmp_reg = BPF_REG_9; \
if (si->src_reg == tmp_reg || si->dst_reg == tmp_reg) \
@@ -7641,8 +7758,7 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
offsetof(S, TF)); \
*insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(S, F), tmp_reg, \
si->dst_reg, offsetof(S, F)); \
- *insn++ = BPF_STX_MEM( \
- BPF_FIELD_SIZEOF(NS, NF), tmp_reg, si->src_reg, \
+ *insn++ = BPF_STX_MEM(SIZE, tmp_reg, si->src_reg, \
bpf_target_off(NS, NF, FIELD_SIZEOF(NS, NF), \
target_size) \
+ OFF); \
@@ -7654,8 +7770,8 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type,
TF) \
do { \
if (type == BPF_WRITE) { \
- SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, OFF, \
- TF); \
+ SOCK_ADDR_STORE_NESTED_FIELD_OFF(S, NS, F, NF, SIZE, \
+ OFF, TF); \
} else { \
SOCK_ADDR_LOAD_NESTED_FIELD_SIZE_OFF( \
S, NS, F, NF, SIZE, OFF); \
@@ -7750,6 +7866,11 @@ static u32 sock_addr_convert_ctx_access(enum bpf_access_type type,
struct bpf_sock_addr_kern, struct in6_addr, t_ctx,
s6_addr32[0], BPF_SIZE(si->code), off, tmp_reg);
break;
+ case offsetof(struct bpf_sock_addr, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct bpf_sock_addr_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_addr_kern, sk));
+ break;
}
return insn - insn_buf;
@@ -7837,9 +7958,6 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \
} while (0)
- CONVERT_COMMON_TCP_SOCK_FIELDS(struct bpf_sock_ops,
- SOCK_OPS_GET_TCP_SOCK_FIELD);
-
if (insn > insn_buf)
return insn - insn_buf;
@@ -8009,6 +8127,82 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type,
SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash,
struct sock, type);
break;
+ case offsetof(struct bpf_sock_ops, snd_cwnd):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_cwnd);
+ break;
+ case offsetof(struct bpf_sock_ops, srtt_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(srtt_us);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_ssthresh):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_ssthresh);
+ break;
+ case offsetof(struct bpf_sock_ops, rcv_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rcv_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_nxt):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_nxt);
+ break;
+ case offsetof(struct bpf_sock_ops, snd_una):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(snd_una);
+ break;
+ case offsetof(struct bpf_sock_ops, mss_cache):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(mss_cache);
+ break;
+ case offsetof(struct bpf_sock_ops, ecn_flags):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(ecn_flags);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_delivered):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_delivered);
+ break;
+ case offsetof(struct bpf_sock_ops, rate_interval_us):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(rate_interval_us);
+ break;
+ case offsetof(struct bpf_sock_ops, packets_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(packets_out);
+ break;
+ case offsetof(struct bpf_sock_ops, retrans_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(retrans_out);
+ break;
+ case offsetof(struct bpf_sock_ops, total_retrans):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(total_retrans);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_in):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_in);
+ break;
+ case offsetof(struct bpf_sock_ops, segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, data_segs_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(data_segs_out);
+ break;
+ case offsetof(struct bpf_sock_ops, lost_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(lost_out);
+ break;
+ case offsetof(struct bpf_sock_ops, sacked_out):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(sacked_out);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_received):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_received);
+ break;
+ case offsetof(struct bpf_sock_ops, bytes_acked):
+ SOCK_OPS_GET_TCP_SOCK_FIELD(bytes_acked);
+ break;
+ case offsetof(struct bpf_sock_ops, sk):
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern,
+ is_fullsock),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern,
+ is_fullsock));
+ *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 1);
+ *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(
+ struct bpf_sock_ops_kern, sk),
+ si->dst_reg, si->src_reg,
+ offsetof(struct bpf_sock_ops_kern, sk));
+ break;
}
return insn - insn_buf;
}
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index edd622956083..3e6fedb57bc1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -27,6 +27,10 @@
#include <scsi/fc/fc_fcoe.h>
#include <uapi/linux/batadv_packet.h>
#include <linux/bpf.h>
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+#include <net/netfilter/nf_conntrack_core.h>
+#include <net/netfilter/nf_conntrack_labels.h>
+#endif
static DEFINE_MUTEX(flow_dissector_mutex);
@@ -199,6 +203,22 @@ __be32 __skb_flow_get_ports(const struct sk_buff *skb, int thoff, u8 ip_proto,
}
EXPORT_SYMBOL(__skb_flow_get_ports);
+void skb_flow_dissect_meta(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container)
+{
+ struct flow_dissector_key_meta *meta;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_META))
+ return;
+
+ meta = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_META,
+ target_container);
+ meta->ingress_ifindex = skb->skb_iif;
+}
+EXPORT_SYMBOL(skb_flow_dissect_meta);
+
static void
skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
struct flow_dissector *flow_dissector,
@@ -216,6 +236,46 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type,
}
void
+skb_flow_dissect_ct(const struct sk_buff *skb,
+ struct flow_dissector *flow_dissector,
+ void *target_container,
+ u16 *ctinfo_map,
+ size_t mapsize)
+{
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
+ struct flow_dissector_key_ct *key;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn_labels *cl;
+ struct nf_conn *ct;
+
+ if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_CT))
+ return;
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (!ct)
+ return;
+
+ key = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_CT,
+ target_container);
+
+ if (ctinfo < mapsize)
+ key->ct_state = ctinfo_map[ctinfo];
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_ZONES)
+ key->ct_zone = ct->zone.id;
+#endif
+#if IS_ENABLED(CONFIG_NF_CONNTRACK_MARK)
+ key->ct_mark = ct->mark;
+#endif
+
+ cl = nf_ct_labels_find(ct);
+ if (cl)
+ memcpy(key->ct_labels, cl->bits, sizeof(key->ct_labels));
+#endif /* CONFIG_NF_CONNTRACK */
+}
+EXPORT_SYMBOL(skb_flow_dissect_ct);
+
+void
skb_flow_dissect_tunnel_info(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container)
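skb_flow_dissect_ct() exports conntrack state into a flow key, translating the kernel ctinfo value through a caller-supplied map. A sketch of such a caller (the MY_CT_* state bits and my_* names are hypothetical; tc flower defines its own mapping):

static u16 my_ct_state_map[IP_CT_NUMBER] = {
	[IP_CT_ESTABLISHED]	= MY_CT_TRACKED | MY_CT_ESTABLISHED,
	[IP_CT_RELATED]		= MY_CT_TRACKED | MY_CT_RELATED,
	[IP_CT_NEW]		= MY_CT_TRACKED | MY_CT_NEW,
};

static void my_dissect(struct sk_buff *skb,
		       struct flow_dissector *dissector, void *key)
{
	skb_flow_dissect_meta(skb, dissector, key);
	skb_flow_dissect_ct(skb, dissector, key,
			    my_ct_state_map, ARRAY_SIZE(my_ct_state_map));
}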
@@ -757,7 +817,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx,
* @nhoff: network header offset, if @data is NULL use skb_network_offset(skb)
* @hlen: packet header length, if @data is NULL use skb_headlen(skb)
* @flags: flags that control the dissection process, e.g.
- * FLOW_DISSECTOR_F_STOP_AT_L3.
+ * FLOW_DISSECTOR_F_STOP_AT_ENCAP.
*
* The function will try to retrieve individual keys into target specified
* by flow_dissector from either the skbuff or a raw buffer specified by the
@@ -922,11 +982,6 @@ proto_again:
__skb_flow_dissect_ipv4(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3) {
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
- break;
- }
-
break;
}
case htons(ETH_P_IPV6): {
@@ -975,9 +1030,6 @@ proto_again:
__skb_flow_dissect_ipv6(skb, flow_dissector,
target_container, data, iph);
- if (flags & FLOW_DISSECTOR_F_STOP_AT_L3)
- fdret = FLOW_DISSECT_RET_OUT_GOOD;
-
break;
}
case htons(ETH_P_8021AD):
diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c
index 5ce7d47a960e..76f8db3841d7 100644
--- a/net/core/flow_offload.c
+++ b/net/core/flow_offload.c
@@ -7,8 +7,7 @@ struct flow_rule *flow_rule_alloc(unsigned int num_actions)
{
struct flow_rule *rule;
- rule = kzalloc(sizeof(struct flow_rule) +
- sizeof(struct flow_action_entry) * num_actions,
+ rule = kzalloc(struct_size(rule, action.entries, num_actions),
GFP_KERNEL);
if (!rule)
return NULL;
@@ -26,6 +25,13 @@ EXPORT_SYMBOL(flow_rule_alloc);
(__out)->key = skb_flow_dissector_target(__d, __type, (__m)->key); \
(__out)->mask = skb_flow_dissector_target(__d, __type, (__m)->mask); \
+void flow_rule_match_meta(const struct flow_rule *rule,
+ struct flow_match_meta *out)
+{
+ FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_META, out);
+}
+EXPORT_SYMBOL(flow_rule_match_meta);
+
void flow_rule_match_basic(const struct flow_rule *rule,
struct flow_match_basic *out)
{
@@ -158,3 +164,121 @@ void flow_rule_match_enc_opts(const struct flow_rule *rule,
FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_ENC_OPTS, out);
}
EXPORT_SYMBOL(flow_rule_match_enc_opts);
+
+struct flow_block_cb *flow_block_cb_alloc(struct net *net, tc_setup_cb_t *cb,
+ void *cb_ident, void *cb_priv,
+ void (*release)(void *cb_priv))
+{
+ struct flow_block_cb *block_cb;
+
+ block_cb = kzalloc(sizeof(*block_cb), GFP_KERNEL);
+ if (!block_cb)
+ return ERR_PTR(-ENOMEM);
+
+ block_cb->net = net;
+ block_cb->cb = cb;
+ block_cb->cb_ident = cb_ident;
+ block_cb->cb_priv = cb_priv;
+ block_cb->release = release;
+
+ return block_cb;
+}
+EXPORT_SYMBOL(flow_block_cb_alloc);
+
+void flow_block_cb_free(struct flow_block_cb *block_cb)
+{
+ if (block_cb->release)
+ block_cb->release(block_cb->cb_priv);
+
+ kfree(block_cb);
+}
+EXPORT_SYMBOL(flow_block_cb_free);
+
+struct flow_block_cb *flow_block_cb_lookup(struct flow_block_offload *f,
+ tc_setup_cb_t *cb, void *cb_ident)
+{
+ struct flow_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, f->driver_block_list, driver_list) {
+ if (block_cb->net == f->net &&
+ block_cb->cb == cb &&
+ block_cb->cb_ident == cb_ident)
+ return block_cb;
+ }
+
+ return NULL;
+}
+EXPORT_SYMBOL(flow_block_cb_lookup);
+
+void *flow_block_cb_priv(struct flow_block_cb *block_cb)
+{
+ return block_cb->cb_priv;
+}
+EXPORT_SYMBOL(flow_block_cb_priv);
+
+void flow_block_cb_incref(struct flow_block_cb *block_cb)
+{
+ block_cb->refcnt++;
+}
+EXPORT_SYMBOL(flow_block_cb_incref);
+
+unsigned int flow_block_cb_decref(struct flow_block_cb *block_cb)
+{
+ return --block_cb->refcnt;
+}
+EXPORT_SYMBOL(flow_block_cb_decref);
+
+bool flow_block_cb_is_busy(tc_setup_cb_t *cb, void *cb_ident,
+ struct list_head *driver_block_list)
+{
+ struct flow_block_cb *block_cb;
+
+ list_for_each_entry(block_cb, driver_block_list, driver_list) {
+ if (block_cb->cb == cb &&
+ block_cb->cb_ident == cb_ident)
+ return true;
+ }
+
+ return false;
+}
+EXPORT_SYMBOL(flow_block_cb_is_busy);
+
+int flow_block_cb_setup_simple(struct flow_block_offload *f,
+ struct list_head *driver_block_list,
+ tc_setup_cb_t *cb, void *cb_ident, void *cb_priv,
+ bool ingress_only)
+{
+ struct flow_block_cb *block_cb;
+
+ if (ingress_only &&
+ f->binder_type != FLOW_BLOCK_BINDER_TYPE_CLSACT_INGRESS)
+ return -EOPNOTSUPP;
+
+ f->driver_block_list = driver_block_list;
+
+ switch (f->command) {
+ case FLOW_BLOCK_BIND:
+ if (flow_block_cb_is_busy(cb, cb_ident, driver_block_list))
+ return -EBUSY;
+
+ block_cb = flow_block_cb_alloc(f->net, cb, cb_ident,
+ cb_priv, NULL);
+ if (IS_ERR(block_cb))
+ return PTR_ERR(block_cb);
+
+ flow_block_cb_add(block_cb, f);
+ list_add_tail(&block_cb->driver_list, driver_block_list);
+ return 0;
+ case FLOW_BLOCK_UNBIND:
+ block_cb = flow_block_cb_lookup(f, cb, cb_ident);
+ if (!block_cb)
+ return -ENOENT;
+
+ flow_block_cb_remove(block_cb, f);
+ list_del(&block_cb->driver_list);
+ return 0;
+ default:
+ return -EOPNOTSUPP;
+ }
+}
+EXPORT_SYMBOL(flow_block_cb_setup_simple);
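flow_block_cb_setup_simple() captures the bind/unbind boilerplate that drivers used to open-code in their TC_SETUP_BLOCK handlers. A minimal sketch of a driver hooking it up (my_dev and my_setup_tc_block_cb are hypothetical):

static LIST_HEAD(my_block_cb_list);

static int my_ndo_setup_tc(struct net_device *dev, enum tc_setup_type type,
			   void *type_data)
{
	struct my_dev *mdev = netdev_priv(dev);

	switch (type) {
	case TC_SETUP_BLOCK:
		/* ingress_only=true rejects clsact egress bindings */
		return flow_block_cb_setup_simple(type_data,
						  &my_block_cb_list,
						  my_setup_tc_block_cb,
						  mdev, mdev, true);
	default:
		return -EOPNOTSUPP;
	}
}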
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
index fd822ca5a245..ac1a66df9adc 100644
--- a/net/core/hwbm.c
+++ b/net/core/hwbm.c
@@ -43,34 +43,33 @@ int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
}
EXPORT_SYMBOL_GPL(hwbm_pool_refill);
-int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num)
{
int err, i;
- unsigned long flags;
- spin_lock_irqsave(&bm_pool->lock, flags);
+ mutex_lock(&bm_pool->buf_lock);
if (bm_pool->buf_num == bm_pool->size) {
pr_warn("pool already filled\n");
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return bm_pool->buf_num;
}
if (buf_num + bm_pool->buf_num > bm_pool->size) {
pr_warn("cannot allocate %d buffers for pool\n",
buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
buf_num, bm_pool->buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return 0;
}
for (i = 0; i < buf_num; i++) {
- err = hwbm_pool_refill(bm_pool, gfp);
+ err = hwbm_pool_refill(bm_pool, GFP_KERNEL);
if (err < 0)
break;
}
@@ -79,7 +78,7 @@ int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
bm_pool->buf_num += i;
pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
- spin_unlock_irqrestore(&bm_pool->lock, flags);
+ mutex_unlock(&bm_pool->buf_lock);
return i;
}
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 04fdc9535772..f153e0601838 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -163,9 +163,16 @@ static void linkwatch_do_dev(struct net_device *dev)
static void __linkwatch_run_queue(int urgent_only)
{
+#define MAX_DO_DEV_PER_LOOP 100
+
+ int do_dev = MAX_DO_DEV_PER_LOOP;
struct net_device *dev;
LIST_HEAD(wrk);
+ /* Give urgent case more budget */
+ if (urgent_only)
+ do_dev += MAX_DO_DEV_PER_LOOP;
+
/*
* Limit the number of linkwatch events to one
* per second so that a runaway driver does not
@@ -184,7 +191,7 @@ static void __linkwatch_run_queue(int urgent_only)
spin_lock_irq(&lweventlist_lock);
list_splice_init(&lweventlist, &wrk);
- while (!list_empty(&wrk)) {
+ while (!list_empty(&wrk) && do_dev > 0) {
dev = list_first_entry(&wrk, struct net_device, link_watch_list);
list_del_init(&dev->link_watch_list);
@@ -195,9 +202,13 @@ static void __linkwatch_run_queue(int urgent_only)
}
spin_unlock_irq(&lweventlist_lock);
linkwatch_do_dev(dev);
+ do_dev--;
spin_lock_irq(&lweventlist_lock);
}
+ /* Add the remaining work back to lweventlist */
+ list_splice_init(&wrk, &lweventlist);
+
if (!list_empty(&lweventlist))
linkwatch_schedule_work(0);
spin_unlock_irq(&lweventlist_lock);
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 9e7fc929bc50..742cea4ce72e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -583,6 +583,8 @@ static struct neighbour *___neigh_create(struct neigh_table *tbl,
int error;
struct neigh_hash_table *nht;
+ trace_neigh_create(tbl, dev, pkey, n, exempt_from_gc);
+
if (!n) {
rc = ERR_PTR(-ENOBUFS);
goto out;
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 470b179d599e..283ddb2dbc7d 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -43,6 +43,10 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(fdb_delete);
EXPORT_TRACEPOINT_SYMBOL_GPL(br_fdb_update);
#endif
+#if IS_ENABLED(CONFIG_PAGE_POOL)
+#include <trace/events/page_pool.h>
+#endif
+
#include <trace/events/neigh.h>
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(neigh_update_done);
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index f7b6dda798e0..a0e0d298c991 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -152,6 +152,17 @@ static void ops_free(const struct pernet_operations *ops, struct net *net)
}
}
+static void ops_pre_exit_list(const struct pernet_operations *ops,
+ struct list_head *net_exit_list)
+{
+ struct net *net;
+
+ if (ops->pre_exit) {
+ list_for_each_entry(net, net_exit_list, exit_list)
+ ops->pre_exit(net);
+ }
+}
+
static void ops_exit_list(const struct pernet_operations *ops,
struct list_head *net_exit_list)
{
@@ -337,6 +348,12 @@ out_undo:
list_add(&net->exit_list, &net_exit_list);
saved_ops = ops;
list_for_each_entry_continue_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
+ synchronize_rcu();
+
+ ops = saved_ops;
+ list_for_each_entry_continue_reverse(ops, &pernet_list, list)
ops_exit_list(ops, &net_exit_list);
ops = saved_ops;
@@ -560,10 +577,15 @@ static void cleanup_net(struct work_struct *work)
list_add_tail(&net->exit_list, &net_exit_list);
}
+ /* Run all of the network namespace pre_exit methods */
+ list_for_each_entry_reverse(ops, &pernet_list, list)
+ ops_pre_exit_list(ops, &net_exit_list);
+
/*
* Another CPU might be rcu-iterating the list, wait for it.
* This needs to be before calling the exit() notifiers, so
* the rcu_barrier() below isn't sufficient alone.
+ * Also the pre_exit() and exit() methods need this barrier.
*/
synchronize_rcu();
@@ -1121,6 +1143,8 @@ static int __register_pernet_operations(struct list_head *list,
out_undo:
/* If I have an error cleanup all namespaces I initialized */
list_del(&ops->list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
return error;
@@ -1135,6 +1159,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
/* See comment in __register_pernet_operations() */
for_each_net(net)
list_add_tail(&net->exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
@@ -1159,6 +1185,8 @@ static void __unregister_pernet_operations(struct pernet_operations *ops)
} else {
LIST_HEAD(net_exit_list);
list_add(&init_net.exit_list, &net_exit_list);
+ ops_pre_exit_list(ops, &net_exit_list);
+ synchronize_rcu();
ops_exit_list(ops, &net_exit_list);
ops_free_list(ops, &net_exit_list);
}
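The new ops_pre_exit_list() stage lets a pernet subsystem unpublish per-namespace state before the synchronize_rcu() that now precedes the ->exit() pass. A sketch of a pernet_operations using it (my_net_* names are hypothetical):

static void __net_exit my_net_pre_exit(struct net *net)
{
	/* Stop new lookups; concurrent RCU readers may still be running
	 * until the grace period the core inserts before ->exit().
	 */
	my_net_unhash(net);
}

static void __net_exit my_net_exit(struct net *net)
{
	my_net_free(net);	/* no readers left at this point */
}

static struct pernet_operations my_net_ops = {
	.init     = my_net_init,
	.pre_exit = my_net_pre_exit,
	.exit     = my_net_exit,
};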
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index dd8b1a460d64..2cf27da1baeb 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -696,16 +696,22 @@ int netpoll_setup(struct netpoll *np)
if (!np->local_ip.ip) {
if (!np->ipv6) {
+ const struct in_ifaddr *ifa;
+
in_dev = __in_dev_get_rtnl(ndev);
+ if (!in_dev)
+ goto put_noaddr;
- if (!in_dev || !in_dev->ifa_list) {
+ ifa = rtnl_dereference(in_dev->ifa_list);
+ if (!ifa) {
+put_noaddr:
np_err(np, "no IP address for %s, aborting\n",
np->dev_name);
err = -EDESTADDRREQ;
goto put;
}
- np->local_ip.ip = in_dev->ifa_list->ifa_local;
+ np->local_ip.ip = ifa->ifa_local;
np_info(np, "local IP %pI4\n", &np->local_ip.ip);
} else {
#if IS_ENABLED(CONFIG_IPV6)
diff --git a/net/core/page_pool.c b/net/core/page_pool.c
index 5b2252c6d49b..3272dc7a8c81 100644
--- a/net/core/page_pool.c
+++ b/net/core/page_pool.c
@@ -4,9 +4,11 @@
* Author: Jesper Dangaard Brouer <netoptimizer@brouer.com>
* Copyright (C) 2016 Red Hat, Inc.
*/
+
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/slab.h>
+#include <linux/device.h>
#include <net/page_pool.h>
#include <linux/dma-direction.h>
@@ -14,6 +16,8 @@
#include <linux/page-flags.h>
#include <linux/mm.h> /* for __put_page() */
+#include <trace/events/page_pool.h>
+
static int page_pool_init(struct page_pool *pool,
const struct page_pool_params *params)
{
@@ -43,6 +47,14 @@ static int page_pool_init(struct page_pool *pool,
if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
return -ENOMEM;
+ atomic_set(&pool->pages_state_release_cnt, 0);
+
+ /* Driver calling page_pool_create() also call page_pool_destroy() */
+ refcount_set(&pool->user_cnt, 1);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ get_device(pool->p.dev);
+
return 0;
}
@@ -61,6 +73,7 @@ struct page_pool *page_pool_create(const struct page_pool_params *params)
kfree(pool);
return ERR_PTR(err);
}
+
return pool;
}
EXPORT_SYMBOL(page_pool_create);
@@ -151,6 +164,11 @@ static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
page->dma_addr = dma;
skip_dma_map:
+ /* Track how many pages are held 'in-flight' */
+ pool->pages_state_hold_cnt++;
+
+ trace_page_pool_state_hold(pool, page, pool->pages_state_hold_cnt);
+
	/* When page is just alloc'ed it should/must have refcnt 1. */
return page;
}
@@ -173,6 +191,33 @@ struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
}
EXPORT_SYMBOL(page_pool_alloc_pages);
+/* Calculate distance between two u32 values, valid if distance is below 2^(31)
+ * https://en.wikipedia.org/wiki/Serial_number_arithmetic#General_Solution
+ */
+#define _distance(a, b) (s32)((a) - (b))
+
+static s32 page_pool_inflight(struct page_pool *pool)
+{
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
+
+ distance = _distance(hold_cnt, release_cnt);
+
+ trace_page_pool_inflight(pool, distance, hold_cnt, release_cnt);
+ return distance;
+}
+
+static bool __page_pool_safe_to_destroy(struct page_pool *pool)
+{
+ s32 inflight = page_pool_inflight(pool);
+
+ /* The distance should not be able to become negative */
+ WARN(inflight < 0, "Negative(%d) inflight packet-pages", inflight);
+
+ return (inflight == 0);
+}
+
/* Cleanup page_pool state from page */
static void __page_pool_clean_page(struct page_pool *pool,
struct page *page)
@@ -180,7 +225,7 @@ static void __page_pool_clean_page(struct page_pool *pool,
dma_addr_t dma;
if (!(pool->p.flags & PP_FLAG_DMA_MAP))
- return;
+ goto skip_dma_unmap;
dma = page->dma_addr;
/* DMA unmap */
@@ -188,12 +233,27 @@ static void __page_pool_clean_page(struct page_pool *pool,
PAGE_SIZE << pool->p.order, pool->p.dma_dir,
DMA_ATTR_SKIP_CPU_SYNC);
page->dma_addr = 0;
+skip_dma_unmap:
+ atomic_inc(&pool->pages_state_release_cnt);
+ trace_page_pool_state_release(pool, page,
+ atomic_read(&pool->pages_state_release_cnt));
+}
+
+/* unmap the page and clean our state */
+void page_pool_unmap_page(struct page_pool *pool, struct page *page)
+{
+	/* When a page is unmapped, this implies the page will not be
+	 * returned to the page_pool.
+ */
+ __page_pool_clean_page(pool, page);
}
+EXPORT_SYMBOL(page_pool_unmap_page);
/* Return a page to the page allocator, cleaning up our state */
static void __page_pool_return_page(struct page_pool *pool, struct page *page)
{
__page_pool_clean_page(pool, page);
+
put_page(page);
/* An optimization would be to call __free_pages(page, pool->p.order)
* knowing page is not part of page-cache (thus avoiding a
@@ -285,21 +345,45 @@ static void __page_pool_empty_ring(struct page_pool *pool)
}
}
-static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+static void __warn_in_flight(struct page_pool *pool)
{
- struct page_pool *pool;
+ u32 release_cnt = atomic_read(&pool->pages_state_release_cnt);
+ u32 hold_cnt = READ_ONCE(pool->pages_state_hold_cnt);
+ s32 distance;
- pool = container_of(rcu, struct page_pool, rcu);
+ distance = _distance(hold_cnt, release_cnt);
+
+ /* Drivers should fix this, but only problematic when DMA is used */
+ WARN(1, "Still in-flight pages:%d hold:%u released:%u",
+ distance, hold_cnt, release_cnt);
+}
+
+void __page_pool_free(struct page_pool *pool)
+{
+	/* Only the last user actually frees/releases resources */
+ if (!page_pool_put(pool))
+ return;
WARN(pool->alloc.count, "API usage violation");
+ WARN(!ptr_ring_empty(&pool->ring), "ptr_ring is not empty");
+
+ /* Can happen due to forced shutdown */
+ if (!__page_pool_safe_to_destroy(pool))
+ __warn_in_flight(pool);
- __page_pool_empty_ring(pool);
ptr_ring_cleanup(&pool->ring, NULL);
+
+ if (pool->p.flags & PP_FLAG_DMA_MAP)
+ put_device(pool->p.dev);
+
kfree(pool);
}
+EXPORT_SYMBOL(__page_pool_free);
-/* Cleanup and release resources */
-void page_pool_destroy(struct page_pool *pool)
+/* Request to shutdown: release pages cached by page_pool, and check
+ * for in-flight pages
+ */
+bool __page_pool_request_shutdown(struct page_pool *pool)
{
struct page *page;
@@ -317,7 +401,6 @@ void page_pool_destroy(struct page_pool *pool)
*/
__page_pool_empty_ring(pool);
- /* An xdp_mem_allocator can still ref page_pool pointer */
- call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+ return __page_pool_safe_to_destroy(pool);
}
-EXPORT_SYMBOL(page_pool_destroy);
+EXPORT_SYMBOL(__page_pool_request_shutdown);
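
To make the in-flight accounting above concrete, a worked example of the u32 serial-number distance, with values chosen to show the wrap-around case:

/* Serial-number distance, as used by page_pool_inflight():
 *
 *   hold_cnt    = 0x00000002   (hold counter has wrapped past U32_MAX)
 *   release_cnt = 0xfffffffe
 *
 *   (u32)(hold_cnt - release_cnt) = 0x00000004
 *   (s32)0x00000004              = 4 pages still in flight
 *
 * The result is valid while the true distance stays below 2^31; a
 * negative value would mean more releases than holds, which
 * __page_pool_safe_to_destroy() flags with a WARN().
 */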
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index f975c5e2a369..bb9915291644 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2118,9 +2118,11 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
rcu_read_lock();
in_dev = __in_dev_get_rcu(pkt_dev->odev);
if (in_dev) {
- if (in_dev->ifa_list) {
- pkt_dev->saddr_min =
- in_dev->ifa_list->ifa_address;
+ const struct in_ifaddr *ifa;
+
+ ifa = rcu_dereference(in_dev->ifa_list);
+ if (ifa) {
+ pkt_dev->saddr_min = ifa->ifa_address;
pkt_dev->saddr_max = pkt_dev->saddr_min;
}
}
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index cec60583931f..1ee6460f8275 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -751,6 +751,10 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 *metrics)
struct nlattr *mx;
int i, valid = 0;
+ /* nothing is dumped for dst_default_metrics, so just skip the loop */
+ if (metrics == dst_default_metrics.metrics)
+ return 0;
+
mx = nla_nest_start_noflag(skb, RTA_METRICS);
if (mx == NULL)
return -ENOBUFS;
@@ -908,6 +912,7 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev,
size += num_vfs *
(nla_total_size(0) +
nla_total_size(sizeof(struct ifla_vf_mac)) +
+ nla_total_size(sizeof(struct ifla_vf_broadcast)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
nla_total_size(0) + /* nest IFLA_VF_VLAN_LIST */
nla_total_size(MAX_VLAN_LIST_LEN *
@@ -1197,6 +1202,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_rate vf_rate;
struct ifla_vf_mac vf_mac;
+ struct ifla_vf_broadcast vf_broadcast;
struct ifla_vf_info ivi;
memset(&ivi, 0, sizeof(ivi));
@@ -1231,6 +1237,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
vf_trust.vf = ivi.vf;
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ memcpy(vf_broadcast.broadcast, dev->broadcast, dev->addr_len);
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
vf_vlan_info.vlan = ivi.vlan;
@@ -1247,6 +1254,7 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
if (!vf)
goto nla_put_vfinfo_failure;
if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_BROADCAST, sizeof(vf_broadcast), &vf_broadcast) ||
nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
&vf_rate) ||
@@ -1753,6 +1761,7 @@ static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = {
static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_MAC] = { .len = sizeof(struct ifla_vf_mac) },
+ [IFLA_VF_BROADCAST] = { .type = NLA_REJECT },
[IFLA_VF_VLAN] = { .len = sizeof(struct ifla_vf_vlan) },
[IFLA_VF_VLAN_LIST] = { .type = NLA_NESTED },
[IFLA_VF_TX_RATE] = { .len = sizeof(struct ifla_vf_tx_rate) },
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c8cd99c3603f..6f1e31f674a3 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -59,6 +59,7 @@
#include <linux/errqueue.h>
#include <linux/prefetch.h>
#include <linux/if_vlan.h>
+#include <linux/mpls.h>
#include <net/protocol.h>
#include <net/dst.h>
@@ -66,12 +67,14 @@
#include <net/checksum.h>
#include <net/ip6_checksum.h>
#include <net/xfrm.h>
+#include <net/mpls.h>
#include <linux/uaccess.h>
#include <trace/events/skb.h>
#include <linux/highmem.h>
#include <linux/capability.h>
#include <linux/user_namespace.h>
+#include <linux/indirect_call_wrapper.h>
#include "datagram.h"
@@ -365,18 +368,20 @@ struct napi_alloc_cache {
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
-static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
+static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc;
- unsigned long flags;
- void *data;
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- local_irq_save(flags);
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, fragsz, gfp_mask);
- local_irq_restore(flags);
- return data;
+ return page_frag_alloc(&nc->page, fragsz, gfp_mask);
+}
+
+void *napi_alloc_frag(unsigned int fragsz)
+{
+ fragsz = SKB_DATA_ALIGN(fragsz);
+
+ return __napi_alloc_frag(fragsz, GFP_ATOMIC);
}
+EXPORT_SYMBOL(napi_alloc_frag);
/**
* netdev_alloc_frag - allocate a page fragment
@@ -387,26 +392,21 @@ static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
*/
void *netdev_alloc_frag(unsigned int fragsz)
{
- fragsz = SKB_DATA_ALIGN(fragsz);
-
- return __netdev_alloc_frag(fragsz, GFP_ATOMIC);
-}
-EXPORT_SYMBOL(netdev_alloc_frag);
-
-static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
-{
- struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
-
- return page_frag_alloc(&nc->page, fragsz, gfp_mask);
-}
+ struct page_frag_cache *nc;
+ void *data;
-void *napi_alloc_frag(unsigned int fragsz)
-{
fragsz = SKB_DATA_ALIGN(fragsz);
-
- return __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, fragsz, GFP_ATOMIC);
+ } else {
+ local_bh_disable();
+ data = __napi_alloc_frag(fragsz, GFP_ATOMIC);
+ local_bh_enable();
+ }
+ return data;
}
-EXPORT_SYMBOL(napi_alloc_frag);
+EXPORT_SYMBOL(netdev_alloc_frag);
/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
@@ -425,7 +425,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
gfp_t gfp_mask)
{
struct page_frag_cache *nc;
- unsigned long flags;
struct sk_buff *skb;
bool pfmemalloc;
void *data;
@@ -446,13 +445,17 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- local_irq_save(flags);
-
- nc = this_cpu_ptr(&netdev_alloc_cache);
- data = page_frag_alloc(nc, len, gfp_mask);
- pfmemalloc = nc->pfmemalloc;
-
- local_irq_restore(flags);
+ if (in_irq() || irqs_disabled()) {
+ nc = this_cpu_ptr(&netdev_alloc_cache);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ } else {
+ local_bh_disable();
+ nc = this_cpu_ptr(&napi_alloc_cache.page);
+ data = page_frag_alloc(nc, len, gfp_mask);
+ pfmemalloc = nc->pfmemalloc;
+ local_bh_enable();
+ }
if (unlikely(!data))
return NULL;
@@ -706,6 +709,105 @@ void kfree_skb_list(struct sk_buff *segs)
}
EXPORT_SYMBOL(kfree_skb_list);
+/* Dump skb information and contents.
+ *
+ * Must only be called from net_ratelimit()-ed paths.
+ *
+ * Dumps up to can_dump_full whole packets if full_pkt, headers otherwise.
+ */
+void skb_dump(const char *level, const struct sk_buff *skb, bool full_pkt)
+{
+ static atomic_t can_dump_full = ATOMIC_INIT(5);
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ struct net_device *dev = skb->dev;
+ struct sock *sk = skb->sk;
+ struct sk_buff *list_skb;
+ bool has_mac, has_trans;
+ int headroom, tailroom;
+ int i, len, seg_len;
+
+ if (full_pkt)
+ full_pkt = atomic_dec_if_positive(&can_dump_full) >= 0;
+
+ if (full_pkt)
+ len = skb->len;
+ else
+ len = min_t(int, skb->len, MAX_HEADER + 128);
+
+ headroom = skb_headroom(skb);
+ tailroom = skb_tailroom(skb);
+
+ has_mac = skb_mac_header_was_set(skb);
+ has_trans = skb_transport_header_was_set(skb);
+
+ printk("%sskb len=%u headroom=%u headlen=%u tailroom=%u\n"
+ "mac=(%d,%d) net=(%d,%d) trans=%d\n"
+ "shinfo(txflags=%u nr_frags=%u gso(size=%hu type=%u segs=%hu))\n"
+ "csum(0x%x ip_summed=%u complete_sw=%u valid=%u level=%u)\n"
+ "hash(0x%x sw=%u l4=%u) proto=0x%04x pkttype=%u iif=%d\n",
+ level, skb->len, headroom, skb_headlen(skb), tailroom,
+ has_mac ? skb->mac_header : -1,
+ has_mac ? skb_mac_header_len(skb) : -1,
+ skb->network_header,
+ has_trans ? skb_network_header_len(skb) : -1,
+ has_trans ? skb->transport_header : -1,
+ sh->tx_flags, sh->nr_frags,
+ sh->gso_size, sh->gso_type, sh->gso_segs,
+ skb->csum, skb->ip_summed, skb->csum_complete_sw,
+ skb->csum_valid, skb->csum_level,
+ skb->hash, skb->sw_hash, skb->l4_hash,
+ ntohs(skb->protocol), skb->pkt_type, skb->skb_iif);
+
+ if (dev)
+ printk("%sdev name=%s feat=0x%pNF\n",
+ level, dev->name, &dev->features);
+ if (sk)
+ printk("%ssk family=%hu type=%hu proto=%hu\n",
+ level, sk->sk_family, sk->sk_type, sk->sk_protocol);
+
+ if (full_pkt && headroom)
+ print_hex_dump(level, "skb headroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->head, headroom, false);
+
+ seg_len = min_t(int, skb_headlen(skb), len);
+ if (seg_len)
+ print_hex_dump(level, "skb linear: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb->data, seg_len, false);
+ len -= seg_len;
+
+ if (full_pkt && tailroom)
+ print_hex_dump(level, "skb tailroom: ", DUMP_PREFIX_OFFSET,
+ 16, 1, skb_tail_pointer(skb), tailroom, false);
+
+ for (i = 0; len && i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ u32 p_off, p_len, copied;
+ struct page *p;
+ u8 *vaddr;
+
+ skb_frag_foreach_page(frag, frag->page_offset,
+ skb_frag_size(frag), p, p_off, p_len,
+ copied) {
+ seg_len = min_t(int, p_len, len);
+ vaddr = kmap_atomic(p);
+ print_hex_dump(level, "skb frag: ",
+ DUMP_PREFIX_OFFSET,
+ 16, 1, vaddr + p_off, seg_len, false);
+ kunmap_atomic(vaddr);
+ len -= seg_len;
+ if (!len)
+ break;
+ }
+ }
+
+ if (full_pkt && skb_has_frag_list(skb)) {
+ printk("skb fraglist:\n");
+ skb_walk_frags(skb, list_skb)
+ skb_dump(level, list_skb, true);
+ }
+}
+EXPORT_SYMBOL(skb_dump);
+
/**
* skb_tx_error - report an sk_buff xmit error
* @skb: buffer that triggered an error
@@ -909,6 +1011,31 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
}
/**
+ * alloc_skb_for_msg() - allocate sk_buff to wrap frag list forming a msg
+ * @first: first sk_buff of the msg
+ */
+struct sk_buff *alloc_skb_for_msg(struct sk_buff *first)
+{
+ struct sk_buff *n;
+
+ n = alloc_skb(0, GFP_ATOMIC);
+ if (!n)
+ return NULL;
+
+ n->len = first->len;
+ n->data_len = first->len;
+ n->truesize = first->truesize;
+
+ skb_shinfo(n)->frag_list = first;
+
+ __copy_skb_header(n, first);
+ n->destructor = NULL;
+
+ return n;
+}
+EXPORT_SYMBOL_GPL(alloc_skb_for_msg);
+
+/**
* skb_morph - morph one skb into another
* @dst: the skb to receive the contents
* @src: the skb to supply the contents
@@ -2508,7 +2635,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
if (copy > 0) {
if (copy > len)
copy = len;
- csum = ops->update(skb->data + offset, copy, csum);
+ csum = INDIRECT_CALL_1(ops->update, csum_partial_ext,
+ skb->data + offset, copy, csum);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -2535,9 +2663,13 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
frag->page_offset + offset - start,
copy, p, p_off, p_len, copied) {
vaddr = kmap_atomic(p);
- csum2 = ops->update(vaddr + p_off, p_len, 0);
+ csum2 = INDIRECT_CALL_1(ops->update,
+ csum_partial_ext,
+ vaddr + p_off, p_len, 0);
kunmap_atomic(vaddr);
- csum = ops->combine(csum, csum2, pos, p_len);
+ csum = INDIRECT_CALL_1(ops->combine,
+ csum_block_add_ext, csum,
+ csum2, pos, p_len);
pos += p_len;
}
@@ -2560,7 +2692,8 @@ __wsum __skb_checksum(const struct sk_buff *skb, int offset, int len,
copy = len;
csum2 = __skb_checksum(frag_iter, offset - start,
copy, 0, ops);
- csum = ops->combine(csum, csum2, pos, copy);
+ csum = INDIRECT_CALL_1(ops->combine, csum_block_add_ext,
+ csum, csum2, pos, copy);
if ((len -= copy) == 0)
return csum;
offset += copy;
@@ -5294,6 +5427,173 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
}
EXPORT_SYMBOL(skb_vlan_push);
+/* Update the ethertype of hdr and the skb csum value if required. */
+static void skb_mod_eth_type(struct sk_buff *skb, struct ethhdr *hdr,
+ __be16 ethertype)
+{
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be16 diff[] = { ~hdr->h_proto, ethertype };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ hdr->h_proto = ethertype;
+}
+
+/**
+ * skb_mpls_push() - push a new MPLS header after the mac header
+ *
+ * @skb: buffer
+ * @mpls_lse: MPLS label stack entry to push
+ * @mpls_proto: ethertype of the new MPLS header (expects 0x8847 or 0x8848)
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_push(struct sk_buff *skb, __be32 mpls_lse, __be16 mpls_proto)
+{
+ struct mpls_shim_hdr *lse;
+ int err;
+
+ if (unlikely(!eth_p_mpls(mpls_proto)))
+ return -EINVAL;
+
+ /* Networking stack does not allow simultaneous Tunnel and MPLS GSO. */
+ if (skb->encapsulation)
+ return -EINVAL;
+
+ err = skb_cow_head(skb, MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (!skb->inner_protocol) {
+ skb_set_inner_network_header(skb, skb->mac_len);
+ skb_set_inner_protocol(skb, skb->protocol);
+ }
+
+ skb_push(skb, MPLS_HLEN);
+ memmove(skb_mac_header(skb) - MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->mac_len);
+
+ lse = mpls_hdr(skb);
+ lse->label_stack_entry = mpls_lse;
+ skb_postpush_rcsum(skb, lse, MPLS_HLEN);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER)
+ skb_mod_eth_type(skb, eth_hdr(skb), mpls_proto);
+ skb->protocol = mpls_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_push);
+
+/**
+ * skb_mpls_pop() - pop the outermost MPLS header
+ *
+ * @skb: buffer
+ * @next_proto: ethertype of header after popped MPLS header
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_pop(struct sk_buff *skb, __be16 next_proto)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ skb_postpull_rcsum(skb, mpls_hdr(skb), MPLS_HLEN);
+ memmove(skb_mac_header(skb) + MPLS_HLEN, skb_mac_header(skb),
+ skb->mac_len);
+
+ __skb_pull(skb, MPLS_HLEN);
+ skb_reset_mac_header(skb);
+ skb_set_network_header(skb, skb->mac_len);
+
+ if (skb->dev && skb->dev->type == ARPHRD_ETHER) {
+ struct ethhdr *hdr;
+
+ /* use mpls_hdr() to get ethertype to account for VLANs. */
+ hdr = (struct ethhdr *)((void *)mpls_hdr(skb) - ETH_HLEN);
+ skb_mod_eth_type(skb, hdr, next_proto);
+ }
+ skb->protocol = next_proto;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_pop);
+
+/**
+ * skb_mpls_update_lse() - modify outermost MPLS header and update csum
+ *
+ * @skb: buffer
+ * @mpls_lse: new MPLS label stack entry to update to
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_update_lse(struct sk_buff *skb, __be32 mpls_lse)
+{
+ int err;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ err = skb_ensure_writable(skb, skb->mac_len + MPLS_HLEN);
+ if (unlikely(err))
+ return err;
+
+ if (skb->ip_summed == CHECKSUM_COMPLETE) {
+ __be32 diff[] = { ~mpls_hdr(skb)->label_stack_entry, mpls_lse };
+
+ skb->csum = csum_partial((char *)diff, sizeof(diff), skb->csum);
+ }
+
+ mpls_hdr(skb)->label_stack_entry = mpls_lse;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(skb_mpls_update_lse);
+
+/**
+ * skb_mpls_dec_ttl() - decrement the TTL of the outermost MPLS header
+ *
+ * @skb: buffer
+ *
+ * Expects skb->data at mac header.
+ *
+ * Returns 0 on success, -errno otherwise.
+ */
+int skb_mpls_dec_ttl(struct sk_buff *skb)
+{
+ u32 lse;
+ u8 ttl;
+
+ if (unlikely(!eth_p_mpls(skb->protocol)))
+ return -EINVAL;
+
+ lse = be32_to_cpu(mpls_hdr(skb)->label_stack_entry);
+ ttl = (lse & MPLS_LS_TTL_MASK) >> MPLS_LS_TTL_SHIFT;
+ if (!--ttl)
+ return -EINVAL;
+
+ lse &= ~MPLS_LS_TTL_MASK;
+ lse |= ttl << MPLS_LS_TTL_SHIFT;
+
+ return skb_mpls_update_lse(skb, cpu_to_be32(lse));
+}
+EXPORT_SYMBOL_GPL(skb_mpls_dec_ttl);
+
/**
* alloc_skb_with_frags - allocate skb with page frags
*
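
A minimal sketch of a caller of the new MPLS push helper, assuming an Ethernet device and example label values; example_push_mpls() is hypothetical, while the MPLS_LS_* masks come from <linux/mpls.h> as used above. skb_dump(KERN_WARNING, skb, false) is the companion debug aid, intended only for net_ratelimit()-ed error paths per its header comment.

#include <linux/skbuff.h>
#include <linux/mpls.h>
#include <linux/if_ether.h>

/* Push label 100, TC 0, bottom-of-stack set, TTL 64 (example values). */
static int example_push_mpls(struct sk_buff *skb)
{
	u32 lse = (100 << MPLS_LS_LABEL_SHIFT) | MPLS_LS_S_MASK |
		  (64 << MPLS_LS_TTL_SHIFT);

	/* skb->data must sit at the mac header, per the kernel-doc above */
	return skb_mpls_push(skb, cpu_to_be32(lse), htons(ETH_P_MPLS_UC));
}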
diff --git a/net/core/sock.c b/net/core/sock.c
index aa4a00d381e3..3e073ca6138f 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1039,6 +1039,10 @@ set_rcvbuf:
}
break;
+ case SO_DETACH_REUSEPORT_BPF:
+ ret = reuseport_detach_prog(sk);
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -2843,7 +2847,7 @@ void sock_init_data(struct socket *sock, struct sock *sk)
if (sock) {
sk->sk_type = sock->type;
- RCU_INIT_POINTER(sk->sk_wq, sock->wq);
+ RCU_INIT_POINTER(sk->sk_wq, &sock->wq);
sock->sk = sk;
sk->sk_uid = SOCK_INODE(sock)->i_uid;
} else {
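
From userspace, the new socket option can plausibly be exercised as in this sketch; the numeric fallback define is an assumption for older headers, and a dummy int is passed because generic setsockopt handling expects at least an int-sized optval even though this option ignores it.

#include <sys/socket.h>

#ifndef SO_DETACH_REUSEPORT_BPF
#define SO_DETACH_REUSEPORT_BPF 68	/* assumed asm-generic value */
#endif

/* Drop the reuseport BPF selector program; sockets stay in the group. */
static int detach_reuseport_prog(int fd)
{
	int dummy = 0;	/* value is ignored by this option */

	return setsockopt(fd, SOL_SOCKET, SO_DETACH_REUSEPORT_BPF,
			  &dummy, sizeof(dummy));
}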
diff --git a/net/core/sock_map.c b/net/core/sock_map.c
index be6092ac69f8..52d4faeee18b 100644
--- a/net/core/sock_map.c
+++ b/net/core/sock_map.c
@@ -44,13 +44,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
/* Make sure page count doesn't overflow. */
cost = (u64) stab->map.max_entries * sizeof(struct sock *);
- if (cost >= U32_MAX - PAGE_SIZE) {
- err = -EINVAL;
- goto free_stab;
- }
-
- stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
- err = bpf_map_precharge_memlock(stab->map.pages);
+ err = bpf_map_charge_init(&stab->map.memory, cost);
if (err)
goto free_stab;
@@ -60,6 +54,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
if (stab->sks)
return &stab->map;
err = -ENOMEM;
+ bpf_map_charge_finish(&stab->map.memory);
free_stab:
kfree(stab);
return ERR_PTR(err);
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
index dc4aefdf2a08..9408f9264d05 100644
--- a/net/core/sock_reuseport.c
+++ b/net/core/sock_reuseport.c
@@ -332,3 +332,27 @@ int reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
return 0;
}
EXPORT_SYMBOL(reuseport_attach_prog);
+
+int reuseport_detach_prog(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return sk->sk_reuseport ? -ENOENT : -EINVAL;
+
+ old_prog = NULL;
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ rcu_swap_protected(reuse->prog, old_prog,
+ lockdep_is_held(&reuseport_lock));
+ spin_unlock_bh(&reuseport_lock);
+
+ if (!old_prog)
+ return -ENOENT;
+
+ sk_reuseport_prog_free(old_prog);
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_detach_prog);
diff --git a/net/core/xdp.c b/net/core/xdp.c
index 8aab08b131d9..d7bf62ffbb5e 100644
--- a/net/core/xdp.c
+++ b/net/core/xdp.c
@@ -14,6 +14,8 @@
#include <net/page_pool.h>
#include <net/xdp.h>
+#include <net/xdp_priv.h> /* struct xdp_mem_allocator */
+#include <trace/events/xdp.h>
#define REG_STATE_NEW 0x0
#define REG_STATE_REGISTERED 0x1
@@ -29,17 +31,6 @@ static int mem_id_next = MEM_ID_MIN;
static bool mem_id_init; /* false */
static struct rhashtable *mem_id_ht;
-struct xdp_mem_allocator {
- struct xdp_mem_info mem;
- union {
- void *allocator;
- struct page_pool *page_pool;
- struct zero_copy_allocator *zc_alloc;
- };
- struct rhash_head node;
- struct rcu_head rcu;
-};
-
static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
{
const u32 *k = data;
@@ -79,13 +70,13 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+	/* Allocator has indicated it is safe to remove before this is called */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ page_pool_free(xa->page_pool);
+
/* Allow this ID to be reused */
ida_simple_remove(&mem_id_pool, xa->mem.id);
- /* Notice, driver is expected to free the *allocator,
- * e.g. page_pool, and MUST also use RCU free.
- */
-
/* Poison memory */
xa->mem.id = 0xFFFF;
xa->mem.type = 0xF0F0;
@@ -94,6 +85,64 @@ static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
kfree(xa);
}
+static bool __mem_id_disconnect(int id, bool force)
+{
+ struct xdp_mem_allocator *xa;
+ bool safe_to_remove = true;
+
+ mutex_lock(&mem_id_lock);
+
+ xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ WARN(1, "Request remove non-existing id(%d), driver bug?", id);
+ return true;
+ }
+ xa->disconnect_cnt++;
+
+ /* Detects in-flight packet-pages for page_pool */
+ if (xa->mem.type == MEM_TYPE_PAGE_POOL)
+ safe_to_remove = page_pool_request_shutdown(xa->page_pool);
+
+ trace_mem_disconnect(xa, safe_to_remove, force);
+
+ if ((safe_to_remove || force) &&
+ !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
+ call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+ mutex_unlock(&mem_id_lock);
+ return (safe_to_remove|force);
+}
+
+#define DEFER_TIME (msecs_to_jiffies(1000))
+#define DEFER_WARN_INTERVAL (30 * HZ)
+#define DEFER_MAX_RETRIES 120
+
+static void mem_id_disconnect_defer_retry(struct work_struct *wq)
+{
+ struct delayed_work *dwq = to_delayed_work(wq);
+ struct xdp_mem_allocator *xa = container_of(dwq, typeof(*xa), defer_wq);
+ bool force = false;
+
+ if (xa->disconnect_cnt > DEFER_MAX_RETRIES)
+ force = true;
+
+ if (__mem_id_disconnect(xa->mem.id, force))
+ return;
+
+ /* Periodic warning */
+ if (time_after_eq(jiffies, xa->defer_warn)) {
+ int sec = (s32)((u32)jiffies - (u32)xa->defer_start) / HZ;
+
+ pr_warn("%s() stalled mem.id=%u shutdown %d attempts %d sec\n",
+ __func__, xa->mem.id, xa->disconnect_cnt, sec);
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ }
+
+ /* Still not ready to be disconnected, retry later */
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
+}
+
void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
{
struct xdp_mem_allocator *xa;
@@ -112,16 +161,30 @@ void xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
if (id == 0)
return;
+ if (__mem_id_disconnect(id, false))
+ return;
+
+	/* Could not disconnect, defer a new disconnect attempt to later */
mutex_lock(&mem_id_lock);
xa = rhashtable_lookup_fast(mem_id_ht, &id, mem_id_rht_params);
- if (xa && !rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params))
- call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+ if (!xa) {
+ mutex_unlock(&mem_id_lock);
+ return;
+ }
+ xa->defer_start = jiffies;
+ xa->defer_warn = jiffies + DEFER_WARN_INTERVAL;
+ INIT_DELAYED_WORK(&xa->defer_wq, mem_id_disconnect_defer_retry);
mutex_unlock(&mem_id_lock);
+ schedule_delayed_work(&xa->defer_wq, DEFER_TIME);
}
EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg_mem_model);
+/* This unregister operation will also clean up and destroy the
+ * allocator. The page_pool_free() operation is first called when it's
+ * safe to remove, possibly deferred to a workqueue.
+ */
void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
{
/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -301,12 +364,18 @@ int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
/* Insert allocator into ID lookup table */
ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
if (IS_ERR(ptr)) {
+ ida_simple_remove(&mem_id_pool, xdp_rxq->mem.id);
+ xdp_rxq->mem.id = 0;
errno = PTR_ERR(ptr);
goto err;
}
+ if (type == MEM_TYPE_PAGE_POOL)
+ page_pool_get(xdp_alloc->page_pool);
+
mutex_unlock(&mem_id_lock);
+ trace_mem_connect(xdp_alloc, xdp_rxq);
return 0;
err:
mutex_unlock(&mem_id_lock);
@@ -333,10 +402,13 @@ static void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct,
/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
page = virt_to_head_page(data);
- if (xa) {
+ if (likely(xa)) {
napi_direct &= !xdp_return_frame_no_direct();
page_pool_put_page(xa->page_pool, page, napi_direct);
} else {
+		/* Hopefully the stack trace shows who to blame for the late return */
+ WARN_ONCE(1, "page_pool gone mem.id=%d", mem->id);
+ trace_mem_return_failed(mem, page);
put_page(page);
}
rcu_read_unlock();
@@ -379,6 +451,21 @@ void xdp_return_buff(struct xdp_buff *xdp)
}
EXPORT_SYMBOL_GPL(xdp_return_buff);
+/* Only called for MEM_TYPE_PAGE_POOL see xdp.h */
+void __xdp_release_frame(void *data, struct xdp_mem_info *mem)
+{
+ struct xdp_mem_allocator *xa;
+ struct page *page;
+
+ rcu_read_lock();
+ xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+ page = virt_to_head_page(data);
+ if (xa)
+ page_pool_release_page(xa->page_pool, page);
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(__xdp_release_frame);
+
int xdp_attachment_query(struct xdp_attachment_info *info,
struct netdev_bpf *bpf)
{
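
Taken together with the page_pool changes above, a driver tear-down sketch under assumed names (example_rxq and example_rxq_drain are hypothetical). The point, per the unregister comment above, is that the driver no longer frees the pool itself; the mem model releases it once in-flight pages drain, possibly from the deferred work.

/* Hypothetical RX queue teardown under the new ownership scheme. */
static void example_rxq_teardown(struct example_rxq *rxq)
{
	/* stop refilling and return pages still held by the ring first */
	example_rxq_drain(rxq);		/* hypothetical helper */

	xdp_rxq_info_unreg(&rxq->xdp_rxq);
	/* no explicit page_pool free here: ownership passed to the mem model */
}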