diff options
Diffstat (limited to 'kernel')
67 files changed, 1572 insertions, 1022 deletions
diff --git a/kernel/async.c b/kernel/async.c index 4c3773c0bf63..d2edd6efec56 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -326,3 +326,4 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } +EXPORT_SYMBOL_GPL(current_is_async); diff --git a/kernel/audit.c b/kernel/audit.c index 5ffcbd354a52..3a3e5deeda8d 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; -static u32 audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ kuid_t audit_sig_uid = INVALID_UID; @@ -509,8 +508,7 @@ static void flush_hold_queue(void) * if auditd just disappeared but we * dequeued an skb we need to drop ref */ - if (skb) - consume_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) @@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy) skb = skb_dequeue(&audit_skb_queue); if (skb) { - if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) + if (!audit_backlog_limit || + (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) wake_up(&audit_backlog_wait); if (audit_pid) kauditd_send_skb(skb); @@ -1232,9 +1231,7 @@ static void audit_buffer_free(struct audit_buffer *ab) if (!ab) return; - if (ab->skb) - kfree_skb(ab->skb); - + kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); @@ -1372,7 +1369,7 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, return NULL; if (gfp_mask & __GFP_DIRECT_RECLAIM) { - if (audit_pid && audit_pid == current->pid) + if (audit_pid && audit_pid == current->tgid) gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; @@ -1395,12 +1392,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; + audit_backlog_wait_time = 0; wake_up(&audit_backlog_wait); return NULL; } - if (!reserve) + if (!reserve && !audit_backlog_wait_time) audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); @@ -1722,7 +1719,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, /* Copy inode data into an audit_names. */ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) + struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; diff --git a/kernel/audit.h b/kernel/audit.h index de6cbb7cf547..cbbe6bb6496e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -207,7 +207,7 @@ extern u32 audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode); + struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2c3d..f84f8d06e1f6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..9f194aad0adc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..195ffaee50b9 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct inode *parent, +void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd572c..972d9a8e4ac4 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -306,10 +306,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 34777b3746fa..c5b30fd8a315 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -14,11 +14,15 @@ #include <linux/filter.h> #include <linux/vmalloc.h> +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; + struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; @@ -79,34 +83,35 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) goto free_htab; - if ((u64) htab->n_buckets * sizeof(struct hlist_head) + + if ((u64) htab->n_buckets * sizeof(struct bucket) + (u64) htab->elem_size * htab->map.max_entries >= U32_MAX - PAGE_SIZE) /* make sure page count doesn't overflow */ goto free_htab; - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + + htab->map.pages = round_up(htab->n_buckets * sizeof(struct bucket) + htab->elem_size * htab->map.max_entries, PAGE_SIZE) >> PAGE_SHIFT; err = -ENOMEM; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - raw_spin_lock_init(&htab->lock); - htab->count = 0; + atomic_set(&htab->count, 0); return &htab->map; @@ -120,11 +125,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -227,6 +237,7 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct htab_elem *l_new, *l_old; struct hlist_head *head; + struct bucket *b; unsigned long flags; u32 key_size; int ret; @@ -248,15 +259,15 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, l_new->hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); l_old = lookup_elem_raw(head, l_new->hash, key, key_size); - if (!l_old && unlikely(htab->count >= map->max_entries)) { + if (!l_old && unlikely(atomic_read(&htab->count) >= map->max_entries)) { /* if elem with this 'key' doesn't exist and we've reached * max_entries limit, fail insertion of new elem */ @@ -284,13 +295,13 @@ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, hlist_del_rcu(&l_old->hash_node); kfree_rcu(l_old, rcu); } else { - htab->count++; + atomic_inc(&htab->count); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); kfree(l_new); return ret; } @@ -300,6 +311,7 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -310,21 +322,21 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree_rcu(l, rcu); ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -339,7 +351,7 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; + atomic_dec(&htab->count); kfree(l); } } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5a8a797d50b7..f2ece3c174a5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 3b39550d8485..637397059f76 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -113,8 +113,28 @@ static int bpf_map_release(struct inode *inode, struct file *filp) return 0; } +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; + + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries); +} +#endif + static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index a7945d10b378..d1d3e8f57de9 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -1121,6 +1121,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 470f6536b9e8..c03a640ef6da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,8 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> - #include <linux/atomic.h> +#include <net/sock.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -211,6 +211,7 @@ static unsigned long have_free_callback __read_mostly; /* Ditto for the can_fork callback. */ static unsigned long have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; @@ -440,11 +441,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -465,25 +461,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -1647,10 +1624,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1717,15 +1690,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -1924,6 +1888,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -2004,6 +1969,7 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; struct cgroup_subsys *ss; struct cgroup_root *root; @@ -2020,6 +1986,17 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_root_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + mutex_lock(&cgroup_mutex); /* First find the desired set of subsystems */ @@ -2027,15 +2004,6 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2146,9 +2114,10 @@ out_free: if (ret) return ERR_PTR(ret); - +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2191,6 +2160,12 @@ static struct file_system_type cgroup_fs_type = { .kill_sb = cgroup_kill_sb, }; +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, +}; + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -4062,7 +4037,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) goto out_err; /* - * Migrate tasks one-by-one until @form is empty. This fails iff + * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { @@ -4903,11 +4878,11 @@ err_free_css: static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, umode_t mode) { - struct cgroup *parent, *cgrp; + struct cgroup *parent, *cgrp, *tcgrp; struct cgroup_root *root; struct cgroup_subsys *ss; struct kernfs_node *kn; - int ssid, ret; + int level, ssid, ret; /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. */ @@ -4918,9 +4893,11 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (!parent) return -ENODEV; root = parent->root; + level = parent->level + 1; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); if (!cgrp) { ret = -ENOMEM; goto out_unlock; @@ -4944,6 +4921,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -5188,7 +5169,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); @@ -5346,6 +5327,7 @@ int __init cgroup_init(void) WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; @@ -5489,19 +5471,6 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) - return &ss_priv[i - CGROUP_CANFORK_START]; - return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - void **private = subsys_canfork_priv_p(ss_priv, i); - return private ? *private : NULL; -} - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5524,14 +5493,13 @@ void cgroup_fork(struct task_struct *child) * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; for_each_subsys_which(ss, i, &have_canfork_callback) { - ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + ret = ss->can_fork(child); if (ret) goto out_revert; } @@ -5543,7 +5511,7 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + ss->cancel_fork(child); } return ret; @@ -5556,15 +5524,14 @@ out_revert: * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ -void cgroup_cancel_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); + ss->cancel_fork(child); } /** @@ -5577,8 +5544,7 @@ void cgroup_cancel_fork(struct task_struct *child, * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child, - void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; @@ -5622,7 +5588,7 @@ void cgroup_post_fork(struct task_struct *child, * and addition to css_set. */ for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); + ss->fork(child); } /** @@ -5822,6 +5788,93 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index 2d3df82c54f2..1b72d56edce5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -200,7 +200,7 @@ static void freezer_attach(struct cgroup_taskset *tset) * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task, void *private) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index b50d5a167fda..303097b37429 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -134,7 +134,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge - * succeded, otherwise -EAGAIN. + * succeeded, otherwise -EAGAIN. */ static int pids_try_charge(struct pids_cgroup *pids, int num) { @@ -209,7 +209,7 @@ static void pids_cancel_attach(struct cgroup_taskset *tset) * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies * on threadgroup_change_begin() held by the copy_process(). */ -static int pids_can_fork(struct task_struct *task, void **priv_p) +static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; @@ -219,7 +219,7 @@ static int pids_can_fork(struct task_struct *task, void **priv_p) return pids_try_charge(pids, 1); } -static void pids_cancel_fork(struct task_struct *task, void *priv) +static void pids_cancel_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e23b..5b9d39633ce9 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -759,71 +759,33 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; +struct cpumask __cpu_possible_mask __read_mostly + = {CPU_BITS_ALL}; #else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); -EXPORT_SYMBOL(cpu_possible_mask); +EXPORT_SYMBOL(__cpu_possible_mask); -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); -EXPORT_SYMBOL(cpu_online_mask); +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); -EXPORT_SYMBOL(cpu_present_mask); +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); -EXPORT_SYMBOL(cpu_active_mask); - -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); -} - -void set_cpu_present(unsigned int cpu, bool present) -{ - if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); -} - -void set_cpu_online(unsigned int cpu, bool online) -{ - if (online) { - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - } else { - cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); - } -} - -void set_cpu_active(unsigned int cpu, bool active) -{ - if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); -} +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); void init_cpu_present(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_present_bits), src); + cpumask_copy(&__cpu_present_mask, src); } void init_cpu_possible(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_possible_bits), src); + cpumask_copy(&__cpu_possible_mask, src); } void init_cpu_online(const struct cpumask *src) { - cpumask_copy(to_cpumask(cpu_online_bits), src); + cpumask_copy(&__cpu_online_mask, src); } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 02a8ea5c9963..3e945fcd8179 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -51,6 +51,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/time.h> +#include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> @@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ + time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; @@ -1374,7 +1375,7 @@ out: */ #define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ #define FM_SCALE 1000 /* faux fixed point scale */ @@ -1390,8 +1391,11 @@ static void fmeter_init(struct fmeter *fmp) /* Internal meter update - process cnt events and update value */ static void fmeter_update(struct fmeter *fmp) { - time_t now = get_seconds(); - time_t ticks = now - fmp->time; + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; if (ticks == 0) return; diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..0c0cd8a62285 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4121345498e0..2a20c0dfdafc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv) continue; kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); + mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); + kdb_printf(" 0x%p", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d783f..435c14a45118 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/events/core.c b/kernel/events/core.c index bf8244190d0f..06ae52e99ac2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3376,7 +3376,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; @@ -4872,9 +4872,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 7dad84913abf..0167679182c0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + false); if (err) return err; @@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); - mem_cgroup_commit_charge(kpage, memcg, false); + page_add_new_anon_rmap(kpage, vma, addr, false); + mem_cgroup_commit_charge(kpage, memcg, false, false); lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); @@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); + mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..10e088237fed 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -59,8 +59,6 @@ #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct *tsk); - static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -1120,8 +1118,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) diff --git a/kernel/fork.c b/kernel/fork.c index 291b08cc817b..2e391c754ae7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -300,9 +300,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -414,7 +414,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -433,8 +433,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; @@ -1250,7 +1249,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1527,7 +1525,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p, cgrp_ss_priv); + retval = cgroup_can_fork(p); if (retval) goto bad_fork_free_pid; @@ -1609,7 +1607,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p, cgrp_ss_priv); + cgroup_post_fork(p); threadgroup_change_end(current); perf_event_fork(p); @@ -1619,7 +1617,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, cgrp_ss_priv); + cgroup_cancel_fork(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1849,16 +1847,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1868,8 +1869,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } diff --git a/kernel/futex.c b/kernel/futex.c index 8a310e240cda..0773f2b23b10 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *page_head; + struct page *page; + struct address_space *mapping; int err, ro = 0; /* @@ -519,46 +520,9 @@ again: else err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. - */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); - + lock_page(page); /* - * If page_head->mapping is NULL, then it cannot be a PageAnon + * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast @@ -570,12 +534,13 @@ again: * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. + * an unlikely race, but we do need to retry for page->mapping. */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); + mapping = compound_head(page)->mapping; + if (!mapping) { + int shmem_swizzled = PageSwapCache(page); + unlock_page(page); + put_page(page); if (shmem_swizzled) goto again; return -EFAULT; @@ -588,7 +553,7 @@ again: * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page_head)) { + if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -603,15 +568,15 @@ again: key->private.address = address; } else { key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; + key->shared.inode = mapping->host; key->shared.pgoff = basepage_index(page); } get_futex_key_refs(key); /* implies MB (B) */ out: - unlock_page(page_head); - put_page(page_head); + unlock_page(page); + put_page(page); return err; } @@ -639,7 +604,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE, NULL); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; @@ -2919,7 +2884,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb6c1..2f9df37940a0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -123,11 +123,6 @@ void gcov_enable_events(void) } #ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within(info, mod->module_core, mod->core_size)) { + if (within_module((unsigned long)info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index d75179735a28..3e56d2f03e24 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -1066,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } +EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 15b249e7c673..38e89ce7b071 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, if (irq_find_mapping(domain, hwirq) > 0) return -EEXIST; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret < 0) - return ret; + if (domain->parent) { + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + } for (i = 0; i < nr_irqs; i++) { ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/kexec.c b/kernel/kexec.c index d873b64fbddc..ee70aef5cd81 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -63,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (ret) goto out_free_image; - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_image; - - /* Enable the special crash kernel control page allocation policy. */ if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index c823f3001e12..8dc659144869 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -310,12 +310,9 @@ static void kimage_free_pages(struct page *page) void kimage_free_page_list(struct list_head *list) { - struct list_head *pos, *next; + struct page *page, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); + list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index b70ada0028d2..007b791f676d 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -109,11 +109,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a698ad4..0a52315d9c62 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index db545cbcdb89..bc2c85c064c1 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/kallsyms.h> #include <linux/livepatch.h> +#include <asm/cacheflush.h> /** * struct klp_ops - structure for tracking registered ftrace ops structs @@ -135,13 +136,8 @@ struct klp_find_arg { const char *objname; const char *name; unsigned long addr; - /* - * If count == 0, the symbol was not found. If count == 1, a unique - * match was found and addr is set. If count > 1, there is - * unresolvable ambiguity among "count" number of symbols with the same - * name in the same object. - */ unsigned long count; + unsigned long pos; }; static int klp_find_callback(void *data, const char *name, @@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name, if (args->objname && strcmp(args->objname, mod->name)) return 0; - /* - * args->addr might be overwritten if another match is found - * but klp_find_object_symbol() handles this and only returns the - * addr if count == 1. - */ args->addr = addr; args->count++; + /* + * Finish the search when the symbol is found for the desired position + * or the position is not defined for a non-unique symbol. + */ + if ((args->pos && (args->count == args->pos)) || + (!args->pos && (args->count > 1))) + return 1; + return 0; } static int klp_find_object_symbol(const char *objname, const char *name, - unsigned long *addr) + unsigned long sympos, unsigned long *addr) { struct klp_find_arg args = { .objname = objname, .name = name, .addr = 0, - .count = 0 + .count = 0, + .pos = sympos, }; mutex_lock(&module_mutex); kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); - if (args.count == 0) + /* + * Ensure an address was found. If sympos is 0, ensure symbol is unique; + * otherwise ensure the symbol position count matches sympos. + */ + if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); - else if (args.count > 1) + else if (args.count > 1 && sympos == 0) { pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", args.count, name, objname); - else { + } else if (sympos != args.count && sympos > 0) { + pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", + sympos, name, objname ? objname : "vmlinux"); + } else { *addr = args.addr; return 0; } @@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -struct klp_verify_args { - const char *name; - const unsigned long addr; -}; - -static int klp_verify_callback(void *data, const char *name, - struct module *mod, unsigned long addr) -{ - struct klp_verify_args *args = data; - - if (!mod && - !strcmp(args->name, name) && - args->addr == addr) - return 1; - - return 0; -} - -static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) -{ - struct klp_verify_args args = { - .name = name, - .addr = addr, - }; - int ret; - - mutex_lock(&module_mutex); - ret = kallsyms_on_each_symbol(klp_verify_callback, &args); - mutex_unlock(&module_mutex); - - if (!ret) { - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", - name, addr); - return -EINVAL; - } - - return 0; -} - -static int klp_find_verify_func_addr(struct klp_object *obj, - struct klp_func *func) -{ - int ret; - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old_addr accordingly */ - if (kaslr_enabled() && func->old_addr) - func->old_addr += kaslr_offset(); -#endif - - if (!func->old_addr || klp_is_module(obj)) - ret = klp_find_object_symbol(obj->name, func->old_name, - &func->old_addr); - else - ret = klp_verify_vmlinux_symbol(func->old_name, - func->old_addr); - - return ret; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, } preempt_enable(); - /* otherwise check if it's in another .o within the patch module */ - return klp_find_object_symbol(pmod->name, name, addr); + /* + * Check if it's in another .o within the patch module. This also + * checks that the external symbol is unique. + */ + return klp_find_object_symbol(pmod->name, name, 0, addr); } static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret; + int ret = 0; + unsigned long val; struct klp_reloc *reloc; if (WARN_ON(!klp_is_object_loaded(obj))) @@ -292,41 +243,38 @@ static int klp_write_object_relocations(struct module *pmod, if (WARN_ON(!obj->relocs)) return -EINVAL; + module_disable_ro(pmod); + for (reloc = obj->relocs; reloc->name; reloc++) { - if (!klp_is_module(obj)) { - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old value accordingly */ - if (kaslr_enabled()) - reloc->val += kaslr_offset(); -#endif - ret = klp_verify_vmlinux_symbol(reloc->name, - reloc->val); - if (ret) - return ret; - } else { - /* module, reloc->val needs to be discovered */ - if (reloc->external) - ret = klp_find_external_symbol(pmod, - reloc->name, - &reloc->val); - else - ret = klp_find_object_symbol(obj->mod->name, - reloc->name, - &reloc->val); - if (ret) - return ret; - } + /* discover the address of the referenced symbol */ + if (reloc->external) { + if (reloc->sympos > 0) { + pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", + reloc->name); + ret = -EINVAL; + goto out; + } + ret = klp_find_external_symbol(pmod, reloc->name, &val); + } else + ret = klp_find_object_symbol(obj->name, + reloc->name, + reloc->sympos, + &val); + if (ret) + goto out; + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - reloc->val + reloc->addend); + val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, reloc->val, ret); - return ret; + reloc->name, val, ret); + goto out; } } - return 0; +out: + module_enable_ro(pmod); + return ret; } static void notrace klp_ftrace_handler(unsigned long ip, @@ -593,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/<object> - * /sys/kernel/livepatch/<patch>/<object>/<func> + * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -738,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; + /* The format for the sysfs directory is <function,sympos> where sympos + * is the nth occurrence of this symbol in kallsyms for the patched + * object. If the user selects 0 for old_sympos, then 1 will be used + * since a unique symbol will be the first occurrence. + */ return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s", func->old_name); + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* parts of the initialization that is done only when the object is loaded */ @@ -756,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, } klp_for_each_func(obj, func) { - ret = klp_find_verify_func_addr(obj, func); + ret = klp_find_object_symbol(obj->name, func->old_name, + func->old_sympos, + &func->old_addr); if (ret) return ret; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 7658d32c5c78..e517a16cb426 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -10,8 +10,11 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include <linux/radix-tree.h> +#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> +#include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> @@ -147,24 +150,127 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); +pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} +EXPORT_SYMBOL(phys_to_pfn_t); + #ifdef CONFIG_ZONE_DEVICE +static DEFINE_MUTEX(pgmap_lock); +static RADIX_TREE(pgmap_radix, GFP_KERNEL); +#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) +#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) + struct page_map { struct resource res; + struct percpu_ref *ref; + struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + +static void pgmap_radix_release(struct resource *res) +{ + resource_size_t key; + + mutex_lock(&pgmap_lock); + for (key = res->start; key <= res->end; key += SECTION_SIZE) + radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&pgmap_lock); +} + +static unsigned long pfn_first(struct page_map *page_map) +{ + struct dev_pagemap *pgmap = &page_map->pgmap; + const struct resource *res = &page_map->res; + struct vmem_altmap *altmap = pgmap->altmap; + unsigned long pfn; + + pfn = res->start >> PAGE_SHIFT; + if (altmap) + pfn += vmem_altmap_offset(altmap); + return pfn; +} + +static unsigned long pfn_end(struct page_map *page_map) +{ + const struct resource *res = &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + +static void devm_memremap_pages_release(struct device *dev, void *data) { - struct page_map *page_map = res; + struct page_map *page_map = data; + struct resource *res = &page_map->res; + resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; + + if (percpu_ref_tryget_live(pgmap->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(pgmap->ref); + } + + pgmap_radix_release(res); /* pages are dead and unused, undo the arch mapping */ - arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + arch_remove_memory(align_start, align_size); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", __func__); +} + +/* assumes rcu_read_lock() held at entry */ +struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + struct page_map *page_map; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + return page_map ? &page_map->pgmap : NULL; } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @ref: a live per-cpu reference count + * @altmap: optional descriptor for allocating the memmap from @res + * + * Notes: + * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time + * (or devm release event). + * + * 2/ @res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref, struct vmem_altmap *altmap) { int is_ram = region_intersects(res->start, resource_size(res), "System RAM"); + resource_size_t key, align_start, align_size; + struct dev_pagemap *pgmap; struct page_map *page_map; + unsigned long pfn; int error, nid; if (is_ram == REGION_MIXED) { @@ -176,25 +282,120 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + + if (!ref) + return ERR_PTR(-EINVAL); + page_map = devres_alloc_node(devm_memremap_pages_release, sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->ref = ref; + pgmap->res = &page_map->res; + + mutex_lock(&pgmap_lock); + error = 0; + for (key = res->start; key <= res->end; key += SECTION_SIZE) { + struct dev_pagemap *dup; + + rcu_read_lock(); + dup = find_dev_pagemap(key); + rcu_read_unlock(); + if (dup) { + dev_err(dev, "%s: %pr collides with mapping for %s\n", + __func__, res, dev_name(dup->dev)); + error = -EBUSY; + break; + } + error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, + page_map); + if (error) { + dev_err(dev, "%s: failed: %d\n", __func__, error); + break; + } + } + mutex_unlock(&pgmap_lock); + if (error) + goto err_radix; + nid = dev_to_node(dev); if (nid < 0) nid = numa_mem_id(); - error = arch_add_memory(nid, res->start, resource_size(res), true); - if (error) { - devres_free(page_map); - return ERR_PTR(error); - } + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + error = arch_add_memory(nid, align_start, align_size, true); + if (error) + goto err_add_memory; + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + + /* ZONE_DEVICE pages must never appear on a slab lru */ + list_force_poison(&page->lru); + page->pgmap = pgmap; + } devres_add(dev, page_map); return __va(res->start); + + err_add_memory: + err_radix: + pgmap_radix_release(res); + devres_free(page_map); + return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSE_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Uncoditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c index 38c7bd5583ff..8358f4697c0c 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -80,15 +80,6 @@ # define debug_align(X) (X) #endif -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -108,13 +99,6 @@ static LIST_HEAD(modules); * Use a latched RB-tree for __module_address(); this allows us to use * RCU-sched lookups of the address from any context. * - * Because modules have two address ranges: init and core, we need two - * latch_tree_nodes entries. Therefore we need the back-pointer from - * mod_tree_node. - * - * Because init ranges are short lived we mark them unlikely and have placed - * them outside the critical cacheline in struct module. - * * This is conditional on PERF_EVENTS || TRACING because those can really hit * __module_address() hard by doing a lot of stack unwinding; potentially from * NMI context. @@ -122,24 +106,16 @@ static LIST_HEAD(modules); static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->module_init; - - return (unsigned long)mod->module_core; + return (unsigned long)layout->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->init_size; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->core_size; + return (unsigned long)layout->size; } static __always_inline bool @@ -197,23 +173,23 @@ static void __mod_tree_remove(struct mod_tree_node *node) */ static void mod_tree_insert(struct module *mod) { - mod->mtn_core.mod = mod; - mod->mtn_init.mod = mod; + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->mtn_core); - if (mod->init_size) - __mod_tree_insert(&mod->mtn_init); + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); } static void mod_tree_remove_init(struct module *mod) { - if (mod->init_size) - __mod_tree_remove(&mod->mtn_init); + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); } static void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->mtn_core); + __mod_tree_remove(&mod->core_layout.mtn); mod_tree_remove_init(mod); } @@ -267,9 +243,9 @@ static void __mod_update_bounds(void *base, unsigned int size) static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->module_core, mod->core_size); - if (mod->init_size) - __mod_update_bounds(mod->module_init, mod->init_size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } #ifdef CONFIG_KGDB_KDB @@ -1214,7 +1190,7 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_size); + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); } static struct module_attribute modinfo_coresize = @@ -1223,7 +1199,7 @@ static struct module_attribute modinfo_coresize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_size); + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); } static struct module_attribute modinfo_initsize = @@ -1873,64 +1849,75 @@ static void mod_sysfs_teardown(struct module *mod) /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [writable data] + * text_size -----^ ^ ^ + * ro_size ------------------------| | + * size -------------------------------------------| + * + * These values are always page-aligned (as is base) */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); +} - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size) +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->size - layout->ro_size) >> PAGE_SHIFT); +} - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); +/* livepatching wants to disable read-only so it can frob module. */ +void module_disable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); + frob_rodata(&mod->init_layout, set_memory_rw); +} - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. - */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } +void module_enable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); } -static void unset_module_core_ro_nx(struct module *mod) +static void module_enable_nx(const struct module *mod) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); } -static void unset_module_init_ro_nx(struct module *mod) +static void module_disable_nx(const struct module *mod) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_x); + frob_writable_data(&mod->core_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_x); + frob_writable_data(&mod->init_layout, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1942,16 +1929,9 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_rw); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_rw); - } + + frob_text(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); } mutex_unlock(&module_mutex); } @@ -1965,23 +1945,25 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_ro); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_ro); - } + + frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); } mutex_unlock(&module_mutex); } + +static void disable_ro_nx(const struct module_layout *layout) +{ + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_rodata(layout, set_memory_x); + frob_writable_data(layout, set_memory_x); +} + #else -static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } +static void disable_ro_nx(const struct module_layout *layout) { } +static void module_enable_nx(const struct module *mod) { } +static void module_disable_nx(const struct module *mod) { } #endif void __weak module_memfree(void *module_region) @@ -2033,19 +2015,19 @@ static void free_module(struct module *mod) synchronize_sched(); mutex_unlock(&module_mutex); - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); + /* This may be empty, but that's OK */ + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - module_memfree(mod->module_init); + module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); - module_memfree(mod->module_core); + disable_ro_nx(&mod->core_layout); + module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2248,20 +2230,20 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || strstarts(sname, ".init")) continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; break; case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); + mod->core_layout.size = debug_align(mod->core_layout.size); break; } } @@ -2277,21 +2259,21 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !strstarts(sname, ".init")) continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; break; case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); break; } } @@ -2401,7 +2383,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2430,7 +2412,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2439,6 +2421,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2466,7 +2453,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, info->index.sym) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + symsect->sh_name); @@ -2476,23 +2463,24 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - mod->core_size = debug_align(mod->core_size); + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); } @@ -2513,12 +2501,13 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) for (i = 0; i < mod->num_symtab; i++) mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; + mod->core_symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_strtab = s = mod->core_layout.base + info->stroffs; src = mod->symtab; for (ndst = i = 0; i < mod->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; dst[ndst++].st_name = s - mod->core_strtab; s += strlcpy(s, &mod->strtab[src[i].st_name], @@ -2964,7 +2953,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc(mod->core_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2974,11 +2963,11 @@ static int move_module(struct module *mod, struct load_info *info) if (!ptr) return -ENOMEM; - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; - if (mod->init_size) { - ptr = module_alloc(mod->init_size); + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -2987,13 +2976,13 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); return -ENOMEM; } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; } else - mod->module_init = NULL; + mod->init_layout.base = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -3005,10 +2994,10 @@ static int move_module(struct module *mod, struct load_info *info) continue; if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init + dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + shdr->sh_entsize; + dest = mod->core_layout.base + shdr->sh_entsize; if (shdr->sh_type != SHT_NOBITS) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); @@ -3070,12 +3059,12 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); set_fs(old_fs); } @@ -3133,8 +3122,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); - module_memfree(mod->module_core); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3221,7 +3210,7 @@ static noinline int do_init_module(struct module *mod) ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->module_init; + freeinit->module_init = mod->init_layout.base; /* * We want to find out whether @mod uses async during init. Clear @@ -3279,12 +3268,12 @@ static noinline int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif mod_tree_remove_init(mod); - unset_module_init_ro_nx(mod); + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we @@ -3373,17 +3362,9 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + /* Set RO and NX regions */ + module_enable_ro(mod); + module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ @@ -3548,8 +3529,8 @@ static int load_module(struct load_info *info, const char __user *uargs, MODULE_STATE_GOING, mod); /* we can't deallocate the module until we clear memory protection */ - unset_module_init_ro_nx(mod); - unset_module_core_ro_nx(mod); + module_disable_ro(mod); + module_disable_nx(mod); ddebug_cleanup: dynamic_debug_remove(info->debug); @@ -3578,7 +3559,7 @@ static int load_module(struct load_info *info, const char __user *uargs, */ ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); module_deallocate(mod, info); free_copy: @@ -3656,9 +3637,9 @@ static const char *get_ksymbol(struct module *mod, /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; else - nextval = (unsigned long)mod->module_core+mod->core_text_size; + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ @@ -3905,7 +3886,7 @@ static int m_show(struct seq_file *m, void *p) return 0; seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); + mod->name, mod->init_layout.size + mod->core_layout.size); print_unload_info(m, mod); /* Informative for users. */ @@ -3914,7 +3895,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); + seq_printf(m, " 0x%pK", mod->core_layout.base); /* Taints info */ if (mod->taints) @@ -4057,8 +4038,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) mod = NULL; } return mod; diff --git a/kernel/panic.c b/kernel/panic.c index b333380c6bb2..d96469de72dc 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -180,8 +180,7 @@ void panic(const char *fmt, ...) * panic() is not being callled from OOPS. */ debug_locks_off(); - console_trylock(); - console_unlock(); + console_flush_on_panic(); if (!panic_blink) panic_blink = no_blink; diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..f4ad91b746f1 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..68d3ebc12601 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..27946975eff0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..efe1b3b17c88 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 2ce8826f1053..c963ba534a78 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include <linux/uio.h> #include <asm/uaccess.h> +#include <asm-generic/sections.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -232,7 +233,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken @@ -273,11 +278,7 @@ static u32 clear_idx; #define LOG_FACILITY(v) ((v) >> 3 & 0xff) /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; @@ -1660,7 +1661,7 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static int recursion_bug; + static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; @@ -1696,7 +1697,7 @@ asmlinkage int vprintk_emit(int facility, int level, * it can be printed at the next appropriate moment: */ if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; + recursion_bug = true; local_irq_restore(flags); return 0; } @@ -1711,7 +1712,7 @@ asmlinkage int vprintk_emit(int facility, int level, static const char recursion_msg[] = "BUG: recent printk recursion!"; - recursion_bug = 0; + recursion_bug = false; /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, @@ -2233,13 +2234,24 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; /* flush buffered message fragment immediately to console */ @@ -2311,6 +2323,9 @@ skip: call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); + + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2378,6 +2393,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ @@ -2658,13 +2692,36 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Some boot consoles access data that is in the init section and which will + * be discarded after the initcalls have been run. To make sure that no code + * will access this data, unregister the boot consoles in a late initcall. + * + * If for some reason, such as deferred probe or the driver being a loadable + * module, the real console hasn't registered yet at this point, there will + * be a brief interval in which no messages are logged to the console, which + * makes it difficult to diagnose problems that occur during this time. + * + * To mitigate this problem somewhat, only unregister consoles whose memory + * intersects with the init section. Note that code exists elsewhere to get + * rid of the boot console as soon as the proper console shows up, so there + * won't be side-effects from postponing the removal. + */ static int __init printk_late_init(void) { struct console *con; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - unregister_console(con); + /* + * Make sure to unregister boot consoles whose data + * resides in the init section before the init section + * is discarded. Boot consoles whose data will stick + * around will automatically be unregistered when the + * proper console replaces them. + */ + if (init_section_intersects(con, sizeof(*con))) + unregister_console(con); } } hotcpu_notifier(console_cpu_notify, 0); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae64cf1..2341efe7fe02 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -219,6 +219,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -228,18 +236,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +329,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; @@ -364,8 +387,14 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - TASK_UNINTERRUPTIBLE); + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); } diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfacae..074994bcfa9b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..09c0597840b0 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1498,8 +1498,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index caf4041f5b0a..bc54e84675da 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -354,7 +354,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 77d97a6fc715..63d3a24e081a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -222,9 +222,9 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); - mutex_lock(&inode->i_mutex); + inode_lock(inode); i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (i == __SCHED_FEAT_NR) return -EINVAL; @@ -8342,7 +8342,7 @@ static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) sched_offline_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task, void *private) +static void cpu_cgroup_fork(struct task_struct *task) { sched_move_task(task); } diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index d5ff5c6bf829..b2ab2ffb1adc 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,9 @@ #include <linux/static_key.h> #include <linux/context_tracking.h> #include "sched.h" +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..544a7133cbd1 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -97,12 +97,6 @@ void default_idle_call(void) static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { - /* Fall back to the default arch idle method on errors. */ - if (next_state < 0) { - default_idle_call(); - return next_state; - } - /* * The idle task must be scheduled, it is pointless to go to idle, just * update no idle residency and return. @@ -168,7 +162,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } @@ -219,6 +213,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..15a1795bbba1 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index edb6de4f5908..a467e6c28a3b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -529,8 +529,6 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#if defined(CONFIG_SMP) || defined(CONFIG_HOTPLUG_CPU) - static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { @@ -628,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } - -#endif /* CONFIG_SMP || CONFIG_HOTPLUG_CPU */ diff --git a/kernel/sys.c b/kernel/sys.c index 6af9212ab5aa..78947de6f969 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1853,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - if (prctl_map.exe_fd != (u32)-1) + if (prctl_map.exe_fd != (u32)-1) { error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); - down_read(&mm->mmap_sem); - if (error) - goto out; + if (error) + return error; + } + + down_write(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1894,10 +1896,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - error = 0; -out: - up_read(&mm->mmap_sem); - return error; + up_write(&mm->mmap_sem); + return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -1963,7 +1963,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); prctl_map.start_code = mm->start_code; @@ -2056,7 +2056,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0623787ec67a..2c5e3a8e00d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid); cond_syscall(sys_setfsgid); cond_syscall(sys_capget); cond_syscall(sys_capset); +cond_syscall(sys_copy_file_range); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..97715fd9e790 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -173,7 +173,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -1568,6 +1568,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; @@ -1735,6 +1757,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; @@ -2047,9 +2083,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, void *data) { int *i, vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2078,15 +2113,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first=0) { @@ -2094,11 +2123,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); if (!left) break; - err = proc_get_long(&kbuf, &left, &lval, &neg, + err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2125,10 +2154,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2310,9 +2338,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2340,15 +2367,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first = 0) { @@ -2357,9 +2378,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (write) { bool neg; - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); - err = proc_get_long(&kbuf, &left, &val, &neg, + err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2385,10 +2406,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2650,34 +2670,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - unsigned long page = 0; - char *kbuf; + char *kbuf, *p; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { - free_page(page); + kfree(kbuf); return -ENOMEM; } - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); if (err) break; @@ -2688,12 +2701,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, val_b = val_a; if (left) { - kbuf++; + p++; left--; } if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, + err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); if (err) @@ -2704,16 +2717,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, break; } if (left) { - kbuf++; + p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); } - free_page(page); + kfree(kbuf); } else { unsigned long bit_a, bit_b = 0; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 99ef0df12807..9d7a053545f5 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -387,7 +387,7 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +int tick_nohz_enabled __read_mostly = 1; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode @@ -430,7 +430,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -717,7 +717,7 @@ static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int update_cpu_load_nohz(active); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a990824c8604..2aeb6ffc0a1e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, if (count >= BLK_TN_MAX_MSG) return -EINVAL; - msg = kmalloc(count + 1, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } + msg = memdup_user_nul(buffer, count); + if (IS_ERR(msg)) + return PTR_ERR(msg); - msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4228fd3682c3..45dd798bcd37 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -316,7 +316,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static struct bpf_verifier_ops kprobe_prog_ops = { +static const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b147247..eca592f977b2 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) - #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ @@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; -static struct ftrace_ops control_ops; - -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -203,7 +196,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void control_ops_disable_all(struct ftrace_ops *ops) +static void per_cpu_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops) *per_cpu_ptr(ops->disabled, cpu) = 1; } -static int control_ops_alloc(struct ftrace_ops *ops) +static int per_cpu_ops_alloc(struct ftrace_ops *ops) { int __percpu *disabled; + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) + return -EINVAL; + disabled = alloc_percpu(int); if (!disabled) return -ENOMEM; ops->disabled = disabled; - control_ops_disable_all(ops); + per_cpu_ops_disable_all(ops); return 0; } @@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { } static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic ops or we force list func, + * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | + FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } -static void add_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int first = *list == &ftrace_list_end; - add_ftrace_ops(list, ops); - if (first) - add_ftrace_ops(&ftrace_ops_list, main_ops); -} - -static int remove_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int ret = remove_ftrace_ops(list, ops); - if (!ret && *list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); - return ret; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops); static int __register_ftrace_function(struct ftrace_ops *ops) @@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - if (control_ops_alloc(ops)) + if (ops->flags & FTRACE_OPS_FL_PER_CPU) { + if (per_cpu_ops_alloc(ops)) return -ENOMEM; - add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - /* The control_ops needs the trampoline update */ - ops = &control_ops; - } else - add_ftrace_ops(&ftrace_ops_list, ops); + } + + add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; @@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - ret = remove_ftrace_list_ops(&ftrace_control_list, - &control_ops, ops); - } else - ret = remove_ftrace_ops(&ftrace_ops_list, ops); + ret = remove_ftrace_ops(&ftrace_ops_list, ops); if (ret < 0) return ret; @@ -1687,6 +1658,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (all) { /* * Only the filter_hash affects all records. @@ -1940,7 +1914,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); } -static void print_ip_ins(const char *fmt, unsigned char *p) +static void print_ip_ins(const char *fmt, const unsigned char *p) { int i; @@ -1952,6 +1926,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + +enum ftrace_bug_type ftrace_bug_type; +const void *ftrace_expected; + +static void print_bug_type(void) +{ + switch (ftrace_bug_type) { + case FTRACE_BUG_UNKNOWN: + break; + case FTRACE_BUG_INIT: + pr_info("Initializing ftrace call sites\n"); + break; + case FTRACE_BUG_NOP: + pr_info("Setting ftrace call site to NOP\n"); + break; + case FTRACE_BUG_CALL: + pr_info("Setting ftrace call site to call ftrace function\n"); + break; + case FTRACE_BUG_UPDATE: + pr_info("Updating ftrace call site to call a different ftrace function\n"); + break; + } +} /** * ftrace_bug - report and shutdown function tracer @@ -1979,8 +1978,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); + if (ftrace_expected) { + print_ip_ins(" expected: ", ftrace_expected); + pr_cont("\n"); + } break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1992,6 +1995,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + print_bug_type(); if (rec) { struct ftrace_ops *ops = NULL; @@ -2000,15 +2004,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) rec->flags & FTRACE_FL_REGS ? " R" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - pr_cont("\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + pr_cont("\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else pr_cont("\ttramp: ERROR!"); } ip = ftrace_get_addr_curr(rec); - pr_cont(" expected tramp: %lx\n", ip); + pr_cont("\n expected tramp: %lx\n", ip); } } @@ -2016,6 +2024,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + if (rec->flags & FTRACE_FL_DISABLED) + return FTRACE_UPDATE_IGNORE; + /* * If we are updating calls: * @@ -2077,9 +2090,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * from the save regs, to a non-save regs function or * vice versa, or from a trampoline call. */ - if (flag & FTRACE_FL_ENABLED) + if (flag & FTRACE_FL_ENABLED) { + ftrace_bug_type = FTRACE_BUG_CALL; return FTRACE_UPDATE_MAKE_CALL; + } + ftrace_bug_type = FTRACE_BUG_UPDATE; return FTRACE_UPDATE_MODIFY_CALL; } @@ -2096,6 +2112,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) FTRACE_FL_REGS_EN); } + ftrace_bug_type = FTRACE_BUG_NOP; return FTRACE_UPDATE_MAKE_NOP; } @@ -2145,6 +2162,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, + struct ftrace_ops *op) +{ + unsigned long ip = rec->ip; + + while_for_each_ftrace_op(op) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; @@ -2307,17 +2342,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; case FTRACE_UPDATE_MAKE_CALL: + ftrace_bug_type = FTRACE_BUG_CALL; return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: + ftrace_bug_type = FTRACE_BUG_NOP; return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: + ftrace_bug_type = FTRACE_BUG_UPDATE; return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -2425,6 +2465,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { + ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); return 0; } @@ -2566,7 +2607,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void control_ops_free(struct ftrace_ops *ops) +static void per_cpu_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); } @@ -2667,13 +2708,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their + * If these are per_cpu ops, they still need their * per_cpu field freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); return 0; } @@ -2714,7 +2755,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the control + * The same goes for freeing the per_cpu data of the per_cpu * ops. * * Again, normal synchronize_sched() is not good enough. @@ -2725,13 +2766,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { schedule_on_each_cpu(ftrace_sync); arch_ftrace_trampoline_free(ops); - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); } return 0; @@ -2798,9 +2839,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - /* If ops traces all mods, we already accounted for it */ + /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 0; + return 1; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && @@ -2814,64 +2855,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 1; } -static int referenced_filters(struct dyn_ftrace *rec) -{ - struct ftrace_ops *ops; - int cnt = 0; - - for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; - } - - return cnt; -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long update_cnt = 0; - unsigned long ref = 0; - bool test = false; + unsigned long rec_flags = 0; int i; + start = ftrace_now(raw_smp_processor_id()); + /* - * When adding a module, we need to check if tracers are - * currently enabled and if they are set to trace all functions. - * If they are, we need to enable the module functions as well - * as update the reference counts for those function records. + * When a module is loaded, this function is called to convert + * the calls to mcount in its text to nops, and also to create + * an entry in the ftrace data. Now, if ftrace is activated + * after this call, but before the module sets its text to + * read-only, the modification of enabling ftrace can fail if + * the read-only is done while ftrace is converting the calls. + * To prevent this, the module's records are set as disabled + * and will be enabled after the call to set the module's text + * to read-only. */ - if (mod) { - struct ftrace_ops *ops; - - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - if (ops_traces_mod(ops)) - ref++; - else - test = true; - } - } - } - - start = ftrace_now(raw_smp_processor_id()); + if (mod) + rec_flags |= FTRACE_FL_DISABLED; for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { - int cnt = ref; /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - if (test) - cnt += referenced_filters(p); - p->flags = cnt; + p->flags = rec_flags; /* * Do the initial record conversion from mcount jump @@ -2881,21 +2899,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) break; update_cnt++; - - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p); - } } } @@ -3258,7 +3261,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - struct ftrace_ops *ops = NULL; + struct ftrace_ops *ops; seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), @@ -3266,14 +3269,19 @@ static int t_show(struct seq_file *m, void *v) rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - seq_printf(m, "\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + seq_printf(m, "\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + add_trampoline_func(m, ops, rec); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else seq_puts(m, "\ttramp: ERROR!"); - + } else { + add_trampoline_func(m, NULL, rec); } - add_trampoline_func(m, ops, rec); } seq_putc(m, '\n'); @@ -4898,6 +4906,19 @@ static int ftrace_process_locs(struct module *mod, #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; @@ -4940,41 +4961,112 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +static void ftrace_module_enable(struct module *mod) { - if (ftrace_disabled || start == end) - return; - ftrace_process_locs(mod, start, end); + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + mutex_lock(&ftrace_lock); + + if (ftrace_disabled) + goto out_unlock; + + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + * + * We also delay this to after the module code already set the + * text to read-only, as we now need to set it back to read-write + * so that we can modify the text. + */ + if (ftrace_start_up) + ftrace_arch_code_modify_prepare(); + + do_for_each_ftrace_rec(pg, rec) { + int cnt; + /* + * do_for_each_ftrace_rec() is a double loop. + * module text shares the pg. If a record is + * not part of this module, then skip this pg, + * which the "break" will do. + */ + if (!within_module_core(rec->ip, mod)) + break; + + cnt = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are, and can trace this record, + * we need to enable the module functions as well as update the + * reference counts for those function records. + */ + if (ftrace_start_up) + cnt += referenced_filters(rec); + + /* This clears FTRACE_FL_DISABLED */ + rec->flags = cnt; + + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(rec, 1); + if (failed) { + ftrace_bug(failed, rec); + goto out_loop; + } + } + + } while_for_each_ftrace_rec(); + + out_loop: + if (ftrace_start_up) + ftrace_arch_code_modify_post_process(); + + out_unlock: + mutex_unlock(&ftrace_lock); } void ftrace_module_init(struct module *mod) { - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); + if (ftrace_disabled || !mod->num_ftrace_callsites) + return; + + ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); } -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; - if (val == MODULE_STATE_GOING) + switch (val) { + case MODULE_STATE_COMING: + ftrace_module_enable(mod); + break; + case MODULE_STATE_GOING: ftrace_release_mod(mod); + break; + default: + break; + } return 0; } #else -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { return 0; } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_exit_nb = { - .notifier_call = ftrace_module_notify_exit, +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, .priority = INT_MIN, /* Run after anything that can remove kprobes */ }; @@ -5006,7 +5098,7 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_exit_nb); + ret = register_module_notifier(&ftrace_module_nb); if (ret) pr_warning("Failed to register trace ftrace module exit notifier\n"); @@ -5116,44 +5208,6 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) - return; - - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). - */ - preempt_disable_notrace(); - trace_recursion_set(TRACE_CONTROL_BIT); - - /* - * Control funcs (perf) uses RCU. Only trace if - * RCU is currently active. - */ - if (!rcu_is_watching()) - goto out; - - do_for_each_ftrace_op(op, ftrace_control_list) { - if (!(op->flags & FTRACE_OPS_FL_STUB) && - !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip, regs)) - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - out: - trace_recursion_clear(TRACE_CONTROL_BIT); - preempt_enable_notrace(); -} - -static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(control_ops) -}; - static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) @@ -5170,8 +5224,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); + do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) { + /* + * Check the following for each ops before calling their func: + * if RCU flag is set, then rcu_is_watching() must be true + * if PER_CPU is set, then ftrace_function_local_disable() + * must be false + * Otherwise test if the ip matches the ops filter + * + * If any of the above fails then the op->func() is not executed. + */ + if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && + (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) && + ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -5195,7 +5263,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -5212,20 +5280,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) /* * If there's only one function registered but it does not support - * recursion, this function will be called by the mcount trampoline. - * This function will handle recursion protection. + * recursion, needs RCU protection and/or requires per cpu handling, then + * this function will be called by the mcount trampoline. */ -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, +static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { int bit; + if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) + return; + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - op->func(ip, parent_ip, op, regs); + preempt_disable_notrace(); + if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) { + op->func(ip, parent_ip, op, regs); + } + + preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5243,12 +5320,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. + * If the function does not handle recursion, needs to be RCU safe, + * or does per cpu logic, then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - return ftrace_ops_recurs_func; + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || + ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + return ftrace_ops_assist_func; return ops->func; } diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9c6045a27ba3..95181e36891a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old, /* * rb_tail_page_update - move the tail page forward - * - * Returns 1 if moved tail page, 0 if someone else did. */ -static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, +static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { - struct buffer_page *old_tail; unsigned long old_entries; unsigned long old_write; - int ret = 0; /* * The tail page now needs to be moved forward. @@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * it is, then it is up to us to update the tail * pointer. */ - if (tail_page == cpu_buffer->tail_page) { + if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; @@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - old_tail = cmpxchg(&cpu_buffer->tail_page, - tail_page, next_page); - - if (old_tail == tail_page) - ret = 1; + /* Again, either we update tail_page or an interrupt does */ + (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); } - - return ret; } static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, @@ -2036,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { + struct buffer_page *buffer_tail_page; + + buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ - if (cpu_buffer->tail_page != tail_page && - cpu_buffer->tail_page != next_page) + if (buffer_tail_page != tail_page && + buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); @@ -2135,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2147,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; - u64 ts; next_page = tail_page; @@ -2221,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, } } - ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); - if (ret) { - /* - * Nested commits always have zero deltas, so - * just reread the time stamp - */ - ts = rb_time_stamp(buffer); - next_page->page->time_stamp = ts; - } + rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now. */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2362,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, addr = (unsigned long)event; addr &= PAGE_MASK; - bpage = cpu_buffer->tail_page; + bpage = READ_ONCE(cpu_buffer->tail_page); if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = @@ -2410,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) again: max_count = cpu_buffer->nr_pages * 100; - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, @@ -2419,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; + /* Only update the write stamp if the page has an event */ + if (rb_page_write(cpu_buffer->commit_page)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2443,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } @@ -2699,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - tail_page = info->tail_page = cpu_buffer->tail_page; + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 87fb9801bd9e..d9293402ee68 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1751,7 +1751,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 919d9d07686f..8414fa40bf27 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -363,8 +363,8 @@ struct trace_option_dentry { * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer - * @start: called when tracing is unpaused (echo 1 > tracing_enabled) - * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @start: called when tracing is unpaused (echo 1 > tracing_on) + * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened @@ -467,8 +467,6 @@ enum { TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, - TRACE_CONTROL_BIT, - TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index cc9f7a9319be..00df25fd86ef 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; return register_ftrace_function(ops); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 4f6ef6912e00..f333e57c4614 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1340,15 +1340,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); @@ -1356,7 +1350,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; @@ -1507,18 +1501,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 42a4009fd75a..b38f617b6181 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -237,28 +237,23 @@ static ssize_t event_trigger_regex_write(struct file *file, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long)buf); - return -EFAULT; - } - buf[cnt] = '\0'; strim(buf); mutex_lock(&event_mutex); event_file = event_file_data(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); return -ENODEV; } ret = trigger_process_regex(event_file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); if (ret < 0) goto out; @@ -543,11 +538,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -575,8 +571,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } @@ -1319,11 +1315,12 @@ static int event_enable_register_trigger(char *glob, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -1344,8 +1341,8 @@ static void event_enable_unregister_trigger(char *glob, (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 88fefa68c516..9bafc211930c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent *extent = NULL; - unsigned long page = 0; - char *kbuf, *pos, *next_line; + char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; /* @@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Get a buffer */ - ret = -ENOMEM; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!page) - goto out; - /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; /* Slurp in the user data */ - ret = -EFAULT; - if (copy_from_user(kbuf, buf, count)) + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) { + ret = PTR_ERR(kbuf); + kbuf = NULL; goto out; - kbuf[count] = '\0'; + } /* Parse the user data */ ret = -EINVAL; @@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = count; out: mutex_unlock(&userns_state_mutex); - if (page) - free_page(page); + kfree(kbuf); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 84b5035cb6a5..b3ace6ebbba3 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -225,7 +226,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's timestamp @@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c579dbab2e36..61a0264e28f9 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -1083,6 +1085,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1385,6 +1389,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -2157,6 +2163,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2240,6 +2248,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2256,9 +2265,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2316,6 +2330,37 @@ repeat: goto repeat; } +/** + * check_flush_dependency - check for flush dependency sanity + * @target_wq: workqueue being flushed + * @target_work: work item being flushed (NULL for workqueue flushes) + * + * %current is trying to flush the whole @target_wq or @target_work on it. + * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not + * reclaiming memory or running on a workqueue which doesn't have + * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to + * a deadlock. + */ +static void check_flush_dependency(struct workqueue_struct *target_wq, + struct work_struct *target_work) +{ + work_func_t target_func = target_work ? target_work->func : NULL; + struct worker *worker; + + if (target_wq->flags & WQ_MEM_RECLAIM) + return; + + worker = current_wq_worker(); + + WARN_ONCE(current->flags & PF_MEMALLOC, + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + current->pid, current->comm, target_wq->name, target_func); + WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + worker->current_pwq->wq->name, worker->current_func, + target_wq->name, target_func); +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -2525,6 +2570,8 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } + check_flush_dependency(wq, NULL); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2697,6 +2744,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) pwq = worker->current_pwq; } + check_flush_dependency(pwq->wq, work); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); @@ -3069,6 +3118,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -3601,7 +3651,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; - int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3612,16 +3661,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs); + if (!ctx) + return -ENOMEM; /* the ctx has been prepared successfully, let's commit it */ - if (ctx) { - apply_wqattrs_commit(ctx); - ret = 0; - } - + apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); - return ret; + return 0; } /** @@ -4308,7 +4355,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -5167,6 +5216,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. + */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5290,6 +5487,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues); |