diff options
author | Jiri Kosina <jkosina@suse.cz> | 2016-04-18 11:18:55 +0200 |
---|---|---|
committer | Jiri Kosina <jkosina@suse.cz> | 2016-04-18 11:18:55 +0200 |
commit | 9938b04472d5c59f8bd8152a548533a8599596a2 (patch) | |
tree | 0fc8318100878c5e446076613ec02a97aa179119 /kernel | |
parent | bd7ced98812dbb906950d8b0ec786f14f631cede (diff) | |
parent | c3b46c73264b03000d1e18b22f5caf63332547c9 (diff) | |
download | linux-9938b04472d5c59f8bd8152a548533a8599596a2.tar.gz |
Merge branch 'master' into for-next
Sync with Linus' tree so that patches against newer codebase can be applied.
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'kernel')
182 files changed, 12066 insertions, 5618 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 53abf008ecb3..f0c40bf49d9f 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -14,11 +14,21 @@ obj-y = fork.o exec_domain.o panic.o \ obj-$(CONFIG_MULTIUSER) += groups.o ifdef CONFIG_FUNCTION_TRACER -# Do not trace debug files and internal ftrace files -CFLAGS_REMOVE_cgroup-debug.o = $(CC_FLAGS_FTRACE) +# Do not trace internal ftrace files CFLAGS_REMOVE_irq_work.o = $(CC_FLAGS_FTRACE) endif +# Prevents flicker of uninteresting __do_softirq()/__local_bh_disable_ip() +# in coverage traces. +KCOV_INSTRUMENT_softirq.o := n +# These are called from save_stack_trace() on slub debug path, +# and produce insane amounts of uninteresting coverage. +KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_extable.o := n +# Don't self-instrument. +KCOV_INSTRUMENT_kcov.o := n +KASAN_SANITIZE_kcov.o := n + # cond_syscall is currently not LTO compatible CFLAGS_sys_ni.o = $(DISABLE_LTO) @@ -69,6 +79,7 @@ obj-$(CONFIG_AUDITSYSCALL) += auditsc.o obj-$(CONFIG_AUDIT_WATCH) += audit_watch.o audit_fsnotify.o obj-$(CONFIG_AUDIT_TREE) += audit_tree.o obj-$(CONFIG_GCOV_KERNEL) += gcov/ +obj-$(CONFIG_KCOV) += kcov.o obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_KGDB) += debug/ obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o diff --git a/kernel/async.c b/kernel/async.c index 4c3773c0bf63..d2edd6efec56 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -326,3 +326,4 @@ bool current_is_async(void) return worker && worker->current_func == async_run_entry_fn; } +EXPORT_SYMBOL_GPL(current_is_async); diff --git a/kernel/audit.c b/kernel/audit.c index 8a056a32ded7..678c3f000191 100644 --- a/kernel/audit.c +++ b/kernel/audit.c @@ -110,7 +110,6 @@ static u32 audit_backlog_limit = 64; #define AUDIT_BACKLOG_WAIT_TIME (60 * HZ) static u32 audit_backlog_wait_time_master = AUDIT_BACKLOG_WAIT_TIME; static u32 audit_backlog_wait_time = AUDIT_BACKLOG_WAIT_TIME; -static u32 audit_backlog_wait_overflow = 0; /* The identity of the user shutting down the audit system. */ kuid_t audit_sig_uid = INVALID_UID; @@ -509,8 +508,7 @@ static void flush_hold_queue(void) * if auditd just disappeared but we * dequeued an skb we need to drop ref */ - if (skb) - consume_skb(skb); + consume_skb(skb); } static int kauditd_thread(void *dummy) @@ -524,7 +522,8 @@ static int kauditd_thread(void *dummy) skb = skb_dequeue(&audit_skb_queue); if (skb) { - if (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit) + if (!audit_backlog_limit || + (skb_queue_len(&audit_skb_queue) <= audit_backlog_limit)) wake_up(&audit_backlog_wait); if (audit_pid) kauditd_send_skb(skb); @@ -810,6 +809,16 @@ static int audit_set_feature(struct sk_buff *skb) return 0; } +static int audit_replace(pid_t pid) +{ + struct sk_buff *skb = audit_make_reply(0, 0, AUDIT_REPLACE, 0, 0, + &pid, sizeof(pid)); + + if (!skb) + return -ENOMEM; + return netlink_unicast(audit_sock, skb, audit_nlk_portid, 0); +} + static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) { u32 seq; @@ -871,9 +880,17 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) } if (s.mask & AUDIT_STATUS_PID) { int new_pid = s.pid; + pid_t requesting_pid = task_tgid_vnr(current); - if ((!new_pid) && (task_tgid_vnr(current) != audit_pid)) + if ((!new_pid) && (requesting_pid != audit_pid)) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); return -EACCES; + } + if (audit_pid && new_pid && + audit_replace(requesting_pid) != -ECONNREFUSED) { + audit_log_config_change("audit_pid", new_pid, audit_pid, 0); + return -EEXIST; + } if (audit_enabled != AUDIT_OFF) audit_log_config_change("audit_pid", new_pid, audit_pid, 1); audit_pid = new_pid; @@ -921,7 +938,7 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) if (err == 1) { /* match or error */ err = 0; if (msg_type == AUDIT_USER_TTY) { - err = tty_audit_push_current(); + err = tty_audit_push(); if (err) break; } @@ -1031,20 +1048,19 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) break; case AUDIT_TTY_GET: { struct audit_tty_status s; - struct task_struct *tsk = current; + unsigned int t; - spin_lock(&tsk->sighand->siglock); - s.enabled = tsk->signal->audit_tty; - s.log_passwd = tsk->signal->audit_tty_log_passwd; - spin_unlock(&tsk->sighand->siglock); + t = READ_ONCE(current->signal->audit_tty); + s.enabled = t & AUDIT_TTY_ENABLE; + s.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_send_reply(skb, seq, AUDIT_TTY_GET, 0, 0, &s, sizeof(s)); break; } case AUDIT_TTY_SET: { struct audit_tty_status s, old; - struct task_struct *tsk = current; struct audit_buffer *ab; + unsigned int t; memset(&s, 0, sizeof(s)); /* guard against past and future API changes */ @@ -1054,14 +1070,14 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh) (s.log_passwd != 0 && s.log_passwd != 1)) err = -EINVAL; - spin_lock(&tsk->sighand->siglock); - old.enabled = tsk->signal->audit_tty; - old.log_passwd = tsk->signal->audit_tty_log_passwd; - if (!err) { - tsk->signal->audit_tty = s.enabled; - tsk->signal->audit_tty_log_passwd = s.log_passwd; + if (err) + t = READ_ONCE(current->signal->audit_tty); + else { + t = s.enabled | (-s.log_passwd & AUDIT_TTY_LOG_PASSWD); + t = xchg(¤t->signal->audit_tty, t); } - spin_unlock(&tsk->sighand->siglock); + old.enabled = t & AUDIT_TTY_ENABLE; + old.log_passwd = !!(t & AUDIT_TTY_LOG_PASSWD); audit_log_common_recv_msg(&ab, AUDIT_CONFIG_CHANGE); audit_log_format(ab, " op=tty_set old-enabled=%d new-enabled=%d" @@ -1232,9 +1248,7 @@ static void audit_buffer_free(struct audit_buffer *ab) if (!ab) return; - if (ab->skb) - kfree_skb(ab->skb); - + kfree_skb(ab->skb); spin_lock_irqsave(&audit_freelist_lock, flags); if (audit_freelist_count > AUDIT_MAXFREE) kfree(ab); @@ -1371,16 +1385,16 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, if (unlikely(audit_filter_type(type))) return NULL; - if (gfp_mask & __GFP_WAIT) { - if (audit_pid && audit_pid == current->pid) - gfp_mask &= ~__GFP_WAIT; + if (gfp_mask & __GFP_DIRECT_RECLAIM) { + if (audit_pid && audit_pid == current->tgid) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; else reserve = 0; } while (audit_backlog_limit && skb_queue_len(&audit_skb_queue) > audit_backlog_limit + reserve) { - if (gfp_mask & __GFP_WAIT && audit_backlog_wait_time) { + if (gfp_mask & __GFP_DIRECT_RECLAIM && audit_backlog_wait_time) { long sleep_time; sleep_time = timeout_start + audit_backlog_wait_time - jiffies; @@ -1395,12 +1409,12 @@ struct audit_buffer *audit_log_start(struct audit_context *ctx, gfp_t gfp_mask, skb_queue_len(&audit_skb_queue), audit_backlog_limit); audit_log_lost("backlog limit exceeded"); - audit_backlog_wait_time = audit_backlog_wait_overflow; + audit_backlog_wait_time = 0; wake_up(&audit_backlog_wait); return NULL; } - if (!reserve) + if (!reserve && !audit_backlog_wait_time) audit_backlog_wait_time = audit_backlog_wait_time_master; ab = audit_buffer_alloc(ctx, gfp_mask, type); @@ -1722,7 +1736,7 @@ static inline int audit_copy_fcaps(struct audit_names *name, /* Copy inode data into an audit_names. */ void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode) + struct inode *inode) { name->ino = inode->i_ino; name->dev = inode->i_sb->s_dev; diff --git a/kernel/audit.h b/kernel/audit.h index de6cbb7cf547..cbbe6bb6496e 100644 --- a/kernel/audit.h +++ b/kernel/audit.h @@ -207,7 +207,7 @@ extern u32 audit_ever_enabled; extern void audit_copy_inode(struct audit_names *name, const struct dentry *dentry, - const struct inode *inode); + struct inode *inode); extern void audit_log_cap(struct audit_buffer *ab, char *prefix, kernel_cap_t *cap); extern void audit_log_name(struct audit_context *context, diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2c3d..f84f8d06e1f6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..3cf1c5978d39 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -185,7 +185,7 @@ static struct audit_watch *audit_init_watch(char *path) return watch; } -/* Translate a watch string to kernel respresentation. */ +/* Translate a watch string to kernel representation. */ int audit_to_watch(struct audit_krule *krule, char *path, int len, u32 op) { struct audit_watch *watch; @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c index b8ff9e193753..94ca7b1e5e7e 100644 --- a/kernel/auditfilter.c +++ b/kernel/auditfilter.c @@ -158,7 +158,7 @@ char *audit_unpack_string(void **bufp, size_t *remain, size_t len) return str; } -/* Translate an inode field to kernel respresentation. */ +/* Translate an inode field to kernel representation. */ static inline int audit_to_inode(struct audit_krule *krule, struct audit_field *f) { @@ -415,7 +415,7 @@ static int audit_field_valid(struct audit_entry *entry, struct audit_field *f) return 0; } -/* Translate struct audit_rule_data to kernel's rule respresentation. */ +/* Translate struct audit_rule_data to kernel's rule representation. */ static struct audit_entry *audit_data_to_entry(struct audit_rule_data *data, size_t datasz) { @@ -593,7 +593,7 @@ static inline size_t audit_pack_string(void **bufp, const char *str) return len; } -/* Translate kernel rule respresentation to struct audit_rule_data. */ +/* Translate kernel rule representation to struct audit_rule_data. */ static struct audit_rule_data *audit_krule_to_data(struct audit_krule *krule) { struct audit_rule_data *data; diff --git a/kernel/auditsc.c b/kernel/auditsc.c index b86cc04959de..7d0e3cf8abe1 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -1754,7 +1754,7 @@ void __audit_inode(struct filename *name, const struct dentry *dentry, unsigned int flags) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); struct audit_names *n; bool parent = flags & AUDIT_INODE_PARENT; @@ -1848,12 +1848,12 @@ void __audit_file(const struct file *file) * must be hooked prior, in order to capture the target inode during * unsuccessful attempts. */ -void __audit_inode_child(const struct inode *parent, +void __audit_inode_child(struct inode *parent, const struct dentry *dentry, const unsigned char type) { struct audit_context *context = current->audit_context; - const struct inode *inode = d_backing_inode(dentry); + struct inode *inode = d_backing_inode(dentry); const char *dname = dentry->d_name.name; struct audit_names *n, *found_parent = NULL, *found_child = NULL; @@ -2412,8 +2412,8 @@ void __audit_seccomp(unsigned long syscall, long signr, int code) return; audit_log_task(ab); audit_log_format(ab, " sig=%ld arch=%x syscall=%ld compat=%d ip=0x%lx code=0x%x", - signr, syscall_get_arch(), syscall, is_compat_task(), - KSTK_EIP(current), code); + signr, syscall_get_arch(), syscall, + in_compat_syscall(), KSTK_EIP(current), code); audit_log_end(ab); } diff --git a/kernel/bpf/Makefile b/kernel/bpf/Makefile index 13272582eee0..eed911d091da 100644 --- a/kernel/bpf/Makefile +++ b/kernel/bpf/Makefile @@ -1,4 +1,7 @@ obj-y := core.o obj-$(CONFIG_BPF_SYSCALL) += syscall.o verifier.o inode.o helpers.o -obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o +obj-$(CONFIG_BPF_SYSCALL) += hashtab.o arraymap.o percpu_freelist.o +ifeq ($(CONFIG_PERF_EVENTS),y) +obj-$(CONFIG_BPF_SYSCALL) += stackmap.o +endif diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index 3f4c99e06c6b..76d5a794e426 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -17,25 +17,63 @@ #include <linux/filter.h> #include <linux/perf_event.h> +static void bpf_array_free_percpu(struct bpf_array *array) +{ + int i; + + for (i = 0; i < array->map.max_entries; i++) + free_percpu(array->pptrs[i]); +} + +static int bpf_array_alloc_percpu(struct bpf_array *array) +{ + void __percpu *ptr; + int i; + + for (i = 0; i < array->map.max_entries; i++) { + ptr = __alloc_percpu_gfp(array->elem_size, 8, + GFP_USER | __GFP_NOWARN); + if (!ptr) { + bpf_array_free_percpu(array); + return -ENOMEM; + } + array->pptrs[i] = ptr; + } + + return 0; +} + /* Called from syscall */ static struct bpf_map *array_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_ARRAY; struct bpf_array *array; - u32 elem_size, array_size; + u64 array_size; + u32 elem_size; /* check sanity of attributes */ if (attr->max_entries == 0 || attr->key_size != 4 || - attr->value_size == 0) + attr->value_size == 0 || attr->map_flags) return ERR_PTR(-EINVAL); + if (attr->value_size >= 1 << (KMALLOC_SHIFT_MAX - 1)) + /* if value_size is bigger, the user space won't be able to + * access the elements. + */ + return ERR_PTR(-E2BIG); + elem_size = round_up(attr->value_size, 8); - /* check round_up into zero and u32 overflow */ - if (elem_size == 0 || - attr->max_entries > (U32_MAX - sizeof(*array)) / elem_size) + array_size = sizeof(*array); + if (percpu) + array_size += (u64) attr->max_entries * sizeof(void *); + else + array_size += (u64) attr->max_entries * elem_size; + + /* make sure there is no u32 overflow later in round_up() */ + if (array_size >= U32_MAX - PAGE_SIZE) return ERR_PTR(-ENOMEM); - array_size = sizeof(*array) + attr->max_entries * elem_size; /* allocate all map elements and zero-initialize them */ array = kzalloc(array_size, GFP_USER | __GFP_NOWARN); @@ -46,12 +84,25 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr) } /* copy mandatory map attributes */ + array->map.map_type = attr->map_type; array->map.key_size = attr->key_size; array->map.value_size = attr->value_size; array->map.max_entries = attr->max_entries; - array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; array->elem_size = elem_size; + if (!percpu) + goto out; + + array_size += (u64) attr->max_entries * elem_size * num_possible_cpus(); + + if (array_size >= U32_MAX - PAGE_SIZE || + elem_size > PCPU_MIN_UNIT_SIZE || bpf_array_alloc_percpu(array)) { + kvfree(array); + return ERR_PTR(-ENOMEM); + } +out: + array->map.pages = round_up(array_size, PAGE_SIZE) >> PAGE_SHIFT; + return &array->map; } @@ -61,12 +112,50 @@ static void *array_map_lookup_elem(struct bpf_map *map, void *key) struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) return NULL; return array->value + array->elem_size * index; } +/* Called from eBPF program */ +static void *percpu_array_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + + if (unlikely(index >= array->map.max_entries)) + return NULL; + + return this_cpu_ptr(array->pptrs[index]); +} + +int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(index >= array->map.max_entries)) + return -ENOENT; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, per_cpu_ptr(pptr, cpu), size); + off += size; + } + rcu_read_unlock(); + return 0; +} + /* Called from syscall */ static int array_map_get_next_key(struct bpf_map *map, void *key, void *next_key) { @@ -93,19 +182,62 @@ static int array_map_update_elem(struct bpf_map *map, void *key, void *value, struct bpf_array *array = container_of(map, struct bpf_array, map); u32 index = *(u32 *)key; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; - if (index >= array->map.max_entries) + if (unlikely(index >= array->map.max_entries)) /* all elements were pre-allocated, cannot insert a new one */ return -E2BIG; - if (map_flags == BPF_NOEXIST) + if (unlikely(map_flags == BPF_NOEXIST)) /* all elements already exist */ return -EEXIST; - memcpy(array->value + array->elem_size * index, value, array->elem_size); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + memcpy(this_cpu_ptr(array->pptrs[index]), + value, map->value_size); + else + memcpy(array->value + array->elem_size * index, + value, map->value_size); + return 0; +} + +int bpf_percpu_array_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + struct bpf_array *array = container_of(map, struct bpf_array, map); + u32 index = *(u32 *)key; + void __percpu *pptr; + int cpu, off = 0; + u32 size; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + if (unlikely(index >= array->map.max_entries)) + /* all elements were pre-allocated, cannot insert a new one */ + return -E2BIG; + + if (unlikely(map_flags == BPF_NOEXIST)) + /* all elements already exist */ + return -EEXIST; + + /* the user space will provide round_up(value_size, 8) bytes that + * will be copied into per-cpu area. bpf programs can only access + * value_size of it. During lookup the same extra bytes will be + * returned or zeros which were zero-filled by percpu_alloc, + * so no kernel data leaks possible + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + pptr = array->pptrs[index]; + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), value + off, size); + off += size; + } + rcu_read_unlock(); return 0; } @@ -127,6 +259,9 @@ static void array_map_free(struct bpf_map *map) */ synchronize_rcu(); + if (array->map.map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + bpf_array_free_percpu(array); + kvfree(array); } @@ -144,9 +279,24 @@ static struct bpf_map_type_list array_type __read_mostly = { .type = BPF_MAP_TYPE_ARRAY, }; +static const struct bpf_map_ops percpu_array_ops = { + .map_alloc = array_map_alloc, + .map_free = array_map_free, + .map_get_next_key = array_map_get_next_key, + .map_lookup_elem = percpu_array_map_lookup_elem, + .map_update_elem = array_map_update_elem, + .map_delete_elem = array_map_delete_elem, +}; + +static struct bpf_map_type_list percpu_array_type __read_mostly = { + .ops = &percpu_array_ops, + .type = BPF_MAP_TYPE_PERCPU_ARRAY, +}; + static int __init register_array_map(void) { bpf_register_map_type(&array_type); + bpf_register_map_type(&percpu_array_type); return 0; } late_initcall(register_array_map); @@ -285,10 +435,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -298,24 +451,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index 334b1bdd572c..be0abf669ced 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -27,6 +27,7 @@ #include <linux/random.h> #include <linux/moduleloader.h> #include <linux/bpf.h> +#include <linux/frame.h> #include <asm/unaligned.h> @@ -306,10 +307,6 @@ static unsigned int __bpf_prog_run(void *ctx, const struct bpf_insn *insn) FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; ARG1 = (u64) (unsigned long) ctx; - /* Registers used in classic BPF programs need to be reset first. */ - regs[BPF_REG_A] = 0; - regs[BPF_REG_X] = 0; - select_insn: goto *jumptable[insn->code]; @@ -653,6 +650,7 @@ load_byte: WARN_RATELIMIT(1, "unknown opcode %02x\n", insn->code); return 0; } +STACK_FRAME_NON_STANDARD(__bpf_prog_run); /* jump table */ bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog *fp) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 19909b22b4f8..fff3650d52fc 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -1,4 +1,5 @@ /* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com + * Copyright (c) 2016 Facebook * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -13,38 +14,127 @@ #include <linux/jhash.h> #include <linux/filter.h> #include <linux/vmalloc.h> +#include "percpu_freelist.h" + +struct bucket { + struct hlist_head head; + raw_spinlock_t lock; +}; struct bpf_htab { struct bpf_map map; - struct hlist_head *buckets; - raw_spinlock_t lock; - u32 count; /* number of elements in this hashtable */ + struct bucket *buckets; + void *elems; + struct pcpu_freelist freelist; + atomic_t count; /* number of elements in this hashtable */ u32 n_buckets; /* number of hash buckets */ u32 elem_size; /* size of each element in bytes */ }; /* each htab element is struct htab_elem + key + value */ struct htab_elem { - struct hlist_node hash_node; + union { + struct hlist_node hash_node; + struct bpf_htab *htab; + struct pcpu_freelist_node fnode; + }; struct rcu_head rcu; u32 hash; char key[0] __aligned(8); }; +static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size, + void __percpu *pptr) +{ + *(void __percpu **)(l->key + key_size) = pptr; +} + +static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size) +{ + return *(void __percpu **)(l->key + key_size); +} + +static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i) +{ + return (struct htab_elem *) (htab->elems + i * htab->elem_size); +} + +static void htab_free_elems(struct bpf_htab *htab) +{ + int i; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto free_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + void __percpu *pptr; + + pptr = htab_elem_get_ptr(get_htab_elem(htab, i), + htab->map.key_size); + free_percpu(pptr); + } +free_elems: + vfree(htab->elems); +} + +static int prealloc_elems_and_freelist(struct bpf_htab *htab) +{ + int err = -ENOMEM, i; + + htab->elems = vzalloc(htab->elem_size * htab->map.max_entries); + if (!htab->elems) + return -ENOMEM; + + if (htab->map.map_type != BPF_MAP_TYPE_PERCPU_HASH) + goto skip_percpu_elems; + + for (i = 0; i < htab->map.max_entries; i++) { + u32 size = round_up(htab->map.value_size, 8); + void __percpu *pptr; + + pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN); + if (!pptr) + goto free_elems; + htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size, + pptr); + } + +skip_percpu_elems: + err = pcpu_freelist_init(&htab->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&htab->freelist, htab->elems, htab->elem_size, + htab->map.max_entries); + return 0; + +free_elems: + htab_free_elems(htab); + return err; +} + /* Called from syscall */ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) { + bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH; struct bpf_htab *htab; int err, i; + u64 cost; + + if (attr->map_flags & ~BPF_F_NO_PREALLOC) + /* reserved bits should not be used */ + return ERR_PTR(-EINVAL); htab = kzalloc(sizeof(*htab), GFP_USER); if (!htab) return ERR_PTR(-ENOMEM); /* mandatory map attributes */ + htab->map.map_type = attr->map_type; htab->map.key_size = attr->key_size; htab->map.value_size = attr->value_size; htab->map.max_entries = attr->max_entries; + htab->map.map_flags = attr->map_flags; /* check sanity of attributes. * value_size == 0 may be allowed in the future to use map as a set @@ -64,36 +154,74 @@ static struct bpf_map *htab_map_alloc(union bpf_attr *attr) */ goto free_htab; - err = -ENOMEM; + if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) - + MAX_BPF_STACK - sizeof(struct htab_elem)) + /* if value_size is bigger, the user space won't be able to + * access the elements via bpf syscall. This check also makes + * sure that the elem_size doesn't overflow and it's + * kmalloc-able later in htab_map_update_elem() + */ + goto free_htab; + + if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE) + /* make sure the size for pcpu_alloc() is reasonable */ + goto free_htab; + + htab->elem_size = sizeof(struct htab_elem) + + round_up(htab->map.key_size, 8); + if (percpu) + htab->elem_size += sizeof(void *); + else + htab->elem_size += round_up(htab->map.value_size, 8); + /* prevent zero size kmalloc and check for u32 overflow */ if (htab->n_buckets == 0 || - htab->n_buckets > U32_MAX / sizeof(struct hlist_head)) + htab->n_buckets > U32_MAX / sizeof(struct bucket)) + goto free_htab; + + cost = (u64) htab->n_buckets * sizeof(struct bucket) + + (u64) htab->elem_size * htab->map.max_entries; + + if (percpu) + cost += (u64) round_up(htab->map.value_size, 8) * + num_possible_cpus() * htab->map.max_entries; + + if (cost >= U32_MAX - PAGE_SIZE) + /* make sure page count doesn't overflow */ + goto free_htab; + + htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + /* if map size is larger than memlock limit, reject it early */ + err = bpf_map_precharge_memlock(htab->map.pages); + if (err) goto free_htab; - htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct hlist_head), + err = -ENOMEM; + htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket), GFP_USER | __GFP_NOWARN); if (!htab->buckets) { - htab->buckets = vmalloc(htab->n_buckets * sizeof(struct hlist_head)); + htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket)); if (!htab->buckets) goto free_htab; } - for (i = 0; i < htab->n_buckets; i++) - INIT_HLIST_HEAD(&htab->buckets[i]); - - raw_spin_lock_init(&htab->lock); - htab->count = 0; + for (i = 0; i < htab->n_buckets; i++) { + INIT_HLIST_HEAD(&htab->buckets[i].head); + raw_spin_lock_init(&htab->buckets[i].lock); + } - htab->elem_size = sizeof(struct htab_elem) + - round_up(htab->map.key_size, 8) + - htab->map.value_size; + if (!(attr->map_flags & BPF_F_NO_PREALLOC)) { + err = prealloc_elems_and_freelist(htab); + if (err) + goto free_buckets; + } - htab->map.pages = round_up(htab->n_buckets * sizeof(struct hlist_head) + - htab->elem_size * htab->map.max_entries, - PAGE_SIZE) >> PAGE_SHIFT; return &htab->map; +free_buckets: + kvfree(htab->buckets); free_htab: kfree(htab); return ERR_PTR(err); @@ -104,11 +232,16 @@ static inline u32 htab_map_hash(const void *key, u32 key_len) return jhash(key, key_len, 0); } -static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash) { return &htab->buckets[hash & (htab->n_buckets - 1)]; } +static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash) +{ + return &__select_bucket(htab, hash)->head; +} + static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, void *key, u32 key_size) { @@ -122,7 +255,7 @@ static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash, } /* Called from syscall or from eBPF program */ -static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +static void *__htab_map_lookup_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; @@ -140,6 +273,13 @@ static void *htab_map_lookup_elem(struct bpf_map *map, void *key) l = lookup_elem_raw(head, hash, key, key_size); + return l; +} + +static void *htab_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + if (l) return l->key + round_up(map->key_size, 8); @@ -200,90 +340,254 @@ find_first_elem: } } - /* itereated over all buckets and all elements */ + /* iterated over all buckets and all elements */ return -ENOENT; } +static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l) +{ + if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) + free_percpu(htab_elem_get_ptr(l, htab->map.key_size)); + kfree(l); + +} + +static void htab_elem_free_rcu(struct rcu_head *head) +{ + struct htab_elem *l = container_of(head, struct htab_elem, rcu); + struct bpf_htab *htab = l->htab; + + /* must increment bpf_prog_active to avoid kprobe+bpf triggering while + * we're calling kfree, otherwise deadlock is possible if kprobes + * are placed somewhere inside of slub + */ + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + htab_elem_free(htab, l); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); +} + +static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l) +{ + if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) { + pcpu_freelist_push(&htab->freelist, &l->fnode); + } else { + atomic_dec(&htab->count); + l->htab = htab; + call_rcu(&l->rcu, htab_elem_free_rcu); + } +} + +static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key, + void *value, u32 key_size, u32 hash, + bool percpu, bool onallcpus) +{ + u32 size = htab->map.value_size; + bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC); + struct htab_elem *l_new; + void __percpu *pptr; + + if (prealloc) { + l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist); + if (!l_new) + return ERR_PTR(-E2BIG); + } else { + if (atomic_inc_return(&htab->count) > htab->map.max_entries) { + atomic_dec(&htab->count); + return ERR_PTR(-E2BIG); + } + l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN); + if (!l_new) + return ERR_PTR(-ENOMEM); + } + + memcpy(l_new->key, key, key_size); + if (percpu) { + /* round up value_size to 8 bytes */ + size = round_up(size, 8); + + if (prealloc) { + pptr = htab_elem_get_ptr(l_new, key_size); + } else { + /* alloc_percpu zero-fills */ + pptr = __alloc_percpu_gfp(size, 8, + GFP_ATOMIC | __GFP_NOWARN); + if (!pptr) { + kfree(l_new); + return ERR_PTR(-ENOMEM); + } + } + + if (!onallcpus) { + /* copy true value_size bytes */ + memcpy(this_cpu_ptr(pptr), value, htab->map.value_size); + } else { + int off = 0, cpu; + + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + if (!prealloc) + htab_elem_set_ptr(l_new, key_size, pptr); + } else { + memcpy(l_new->key + round_up(key_size, 8), value, size); + } + + l_new->hash = hash; + return l_new; +} + +static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old, + u64 map_flags) +{ + if (l_old && map_flags == BPF_NOEXIST) + /* elem already exists */ + return -EEXIST; + + if (!l_old && map_flags == BPF_EXIST) + /* elem doesn't exist, cannot update it */ + return -ENOENT; + + return 0; +} + /* Called from syscall or from eBPF program */ static int htab_map_update_elem(struct bpf_map *map, void *key, void *value, u64 map_flags) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); - struct htab_elem *l_new, *l_old; + struct htab_elem *l_new = NULL, *l_old; struct hlist_head *head; unsigned long flags; - u32 key_size; + struct bucket *b; + u32 key_size, hash; int ret; - if (map_flags > BPF_EXIST) + if (unlikely(map_flags > BPF_EXIST)) /* unknown flags */ return -EINVAL; WARN_ON_ONCE(!rcu_read_lock_held()); - /* allocate new element outside of lock */ - l_new = kmalloc(htab->elem_size, GFP_ATOMIC); - if (!l_new) - return -ENOMEM; - key_size = map->key_size; - memcpy(l_new->key, key, key_size); - memcpy(l_new->key + round_up(key_size, 8), value, map->value_size); + hash = htab_map_hash(key, key_size); - l_new->hash = htab_map_hash(l_new->key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; /* bpf_map_update_elem() can be called in_irq() */ - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, l_new->hash); + raw_spin_lock_irqsave(&b->lock, flags); - l_old = lookup_elem_raw(head, l_new->hash, key, key_size); + l_old = lookup_elem_raw(head, hash, key, key_size); - if (!l_old && unlikely(htab->count >= map->max_entries)) { - /* if elem with this 'key' doesn't exist and we've reached - * max_entries limit, fail insertion of new elem - */ - ret = -E2BIG; + ret = check_flags(htab, l_old, map_flags); + if (ret) goto err; - } - - if (l_old && map_flags == BPF_NOEXIST) { - /* elem already exists */ - ret = -EEXIST; - goto err; - } - if (!l_old && map_flags == BPF_EXIST) { - /* elem doesn't exist, cannot update it */ - ret = -ENOENT; + l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false); + if (IS_ERR(l_new)) { + /* all pre-allocated elements are in use or memory exhausted */ + ret = PTR_ERR(l_new); goto err; } - /* add new element to the head of the list, so that concurrent - * search will find it before old elem + /* add new element to the head of the list, so that + * concurrent search will find it before old elem */ hlist_add_head_rcu(&l_new->hash_node, head); if (l_old) { hlist_del_rcu(&l_old->hash_node); - kfree_rcu(l_old, rcu); - } else { - htab->count++; + free_htab_elem(htab, l_old); } - raw_spin_unlock_irqrestore(&htab->lock, flags); + ret = 0; +err: + raw_spin_unlock_irqrestore(&b->lock, flags); + return ret; +} - return 0; +static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags, + bool onallcpus) +{ + struct bpf_htab *htab = container_of(map, struct bpf_htab, map); + struct htab_elem *l_new = NULL, *l_old; + struct hlist_head *head; + unsigned long flags; + struct bucket *b; + u32 key_size, hash; + int ret; + + if (unlikely(map_flags > BPF_EXIST)) + /* unknown flags */ + return -EINVAL; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + key_size = map->key_size; + + hash = htab_map_hash(key, key_size); + + b = __select_bucket(htab, hash); + head = &b->head; + + /* bpf_map_update_elem() can be called in_irq() */ + raw_spin_lock_irqsave(&b->lock, flags); + + l_old = lookup_elem_raw(head, hash, key, key_size); + + ret = check_flags(htab, l_old, map_flags); + if (ret) + goto err; + + if (l_old) { + void __percpu *pptr = htab_elem_get_ptr(l_old, key_size); + u32 size = htab->map.value_size; + + /* per-cpu hash map can update value in-place */ + if (!onallcpus) { + memcpy(this_cpu_ptr(pptr), value, size); + } else { + int off = 0, cpu; + + size = round_up(size, 8); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(per_cpu_ptr(pptr, cpu), + value + off, size); + off += size; + } + } + } else { + l_new = alloc_htab_elem(htab, key, value, key_size, + hash, true, onallcpus); + if (IS_ERR(l_new)) { + ret = PTR_ERR(l_new); + goto err; + } + hlist_add_head_rcu(&l_new->hash_node, head); + } + ret = 0; err: - raw_spin_unlock_irqrestore(&htab->lock, flags); - kfree(l_new); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } +static int htab_percpu_map_update_elem(struct bpf_map *map, void *key, + void *value, u64 map_flags) +{ + return __htab_percpu_map_update_elem(map, key, value, map_flags, false); +} + /* Called from syscall or from eBPF program */ static int htab_map_delete_elem(struct bpf_map *map, void *key) { struct bpf_htab *htab = container_of(map, struct bpf_htab, map); struct hlist_head *head; + struct bucket *b; struct htab_elem *l; unsigned long flags; u32 hash, key_size; @@ -294,21 +598,20 @@ static int htab_map_delete_elem(struct bpf_map *map, void *key) key_size = map->key_size; hash = htab_map_hash(key, key_size); + b = __select_bucket(htab, hash); + head = &b->head; - raw_spin_lock_irqsave(&htab->lock, flags); - - head = select_bucket(htab, hash); + raw_spin_lock_irqsave(&b->lock, flags); l = lookup_elem_raw(head, hash, key, key_size); if (l) { hlist_del_rcu(&l->hash_node); - htab->count--; - kfree_rcu(l, rcu); + free_htab_elem(htab, l); ret = 0; } - raw_spin_unlock_irqrestore(&htab->lock, flags); + raw_spin_unlock_irqrestore(&b->lock, flags); return ret; } @@ -323,12 +626,10 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab->count--; - kfree(l); + htab_elem_free(htab, l); } } } - /* Called when map->refcnt goes to zero, either from workqueue or from syscall */ static void htab_map_free(struct bpf_map *map) { @@ -341,10 +642,16 @@ static void htab_map_free(struct bpf_map *map) */ synchronize_rcu(); - /* some of kfree_rcu() callbacks for elements of this map may not have - * executed. It's ok. Proceed to free residual elements and map itself + /* some of free_htab_elem() callbacks for elements of this map may + * not have executed. Wait for them. */ - delete_all_elements(htab); + rcu_barrier(); + if (htab->map.map_flags & BPF_F_NO_PREALLOC) { + delete_all_elements(htab); + } else { + htab_free_elems(htab); + pcpu_freelist_destroy(&htab->freelist); + } kvfree(htab->buckets); kfree(htab); } @@ -363,9 +670,76 @@ static struct bpf_map_type_list htab_type __read_mostly = { .type = BPF_MAP_TYPE_HASH, }; +/* Called from eBPF program */ +static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key) +{ + struct htab_elem *l = __htab_map_lookup_elem(map, key); + + if (l) + return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size)); + else + return NULL; +} + +int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value) +{ + struct htab_elem *l; + void __percpu *pptr; + int ret = -ENOENT; + int cpu, off = 0; + u32 size; + + /* per_cpu areas are zero-filled and bpf programs can only + * access 'value_size' of them, so copying rounded areas + * will not leak any kernel data + */ + size = round_up(map->value_size, 8); + rcu_read_lock(); + l = __htab_map_lookup_elem(map, key); + if (!l) + goto out; + pptr = htab_elem_get_ptr(l, map->key_size); + for_each_possible_cpu(cpu) { + bpf_long_memcpy(value + off, + per_cpu_ptr(pptr, cpu), size); + off += size; + } + ret = 0; +out: + rcu_read_unlock(); + return ret; +} + +int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + int ret; + + rcu_read_lock(); + ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true); + rcu_read_unlock(); + + return ret; +} + +static const struct bpf_map_ops htab_percpu_ops = { + .map_alloc = htab_map_alloc, + .map_free = htab_map_free, + .map_get_next_key = htab_map_get_next_key, + .map_lookup_elem = htab_percpu_map_lookup_elem, + .map_update_elem = htab_percpu_map_update_elem, + .map_delete_elem = htab_map_delete_elem, +}; + +static struct bpf_map_type_list htab_percpu_type __read_mostly = { + .ops = &htab_percpu_ops, + .type = BPF_MAP_TYPE_PERCPU_HASH, +}; + static int __init register_htab_map(void) { bpf_register_map_type(&htab_type); + bpf_register_map_type(&htab_percpu_type); return 0; } late_initcall(register_htab_map); diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c index 4504ca66118d..50da680c479f 100644 --- a/kernel/bpf/helpers.c +++ b/kernel/bpf/helpers.c @@ -166,7 +166,7 @@ static u64 bpf_get_current_comm(u64 r1, u64 size, u64 r3, u64 r4, u64 r5) if (!task) return -EINVAL; - memcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); + strlcpy(buf, task->comm, min_t(size_t, size, sizeof(task->comm))); return 0; } diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index be6d726e31c9..f2ece3c174a5 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -34,7 +34,7 @@ static void *bpf_any_get(void *raw, enum bpf_type type) atomic_inc(&((struct bpf_prog *)raw)->aux->refcnt); break; case BPF_TYPE_MAP: - atomic_inc(&((struct bpf_map *)raw)->refcnt); + bpf_map_inc(raw, true); break; default: WARN_ON_ONCE(1); @@ -51,7 +51,7 @@ static void bpf_any_put(void *raw, enum bpf_type type) bpf_prog_put(raw); break; case BPF_TYPE_MAP: - bpf_map_put(raw); + bpf_map_put_with_uref(raw); break; default: WARN_ON_ONCE(1); @@ -64,7 +64,7 @@ static void *bpf_fd_probe_obj(u32 ufd, enum bpf_type *type) void *raw; *type = BPF_TYPE_MAP; - raw = bpf_map_get(ufd); + raw = bpf_map_get_with_uref(ufd); if (IS_ERR(raw)) { *type = BPF_TYPE_PROG; raw = bpf_prog_get(ufd); @@ -187,11 +187,31 @@ static int bpf_mkobj(struct inode *dir, struct dentry *dentry, umode_t mode, } } +static int bpf_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_link(old_dentry, dir, new_dentry); +} + +static int bpf_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + if (bpf_dname_reserved(new_dentry)) + return -EPERM; + + return simple_rename(old_dir, old_dentry, new_dir, new_dentry); +} + static const struct inode_operations bpf_dir_iops = { .lookup = simple_lookup, .mknod = bpf_mkobj, .mkdir = bpf_mkdir, .rmdir = simple_rmdir, + .rename = bpf_rename, + .link = bpf_link, .unlink = simple_unlink, }; diff --git a/kernel/bpf/percpu_freelist.c b/kernel/bpf/percpu_freelist.c new file mode 100644 index 000000000000..5c51d1985b51 --- /dev/null +++ b/kernel/bpf/percpu_freelist.c @@ -0,0 +1,100 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include "percpu_freelist.h" + +int pcpu_freelist_init(struct pcpu_freelist *s) +{ + int cpu; + + s->freelist = alloc_percpu(struct pcpu_freelist_head); + if (!s->freelist) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + struct pcpu_freelist_head *head = per_cpu_ptr(s->freelist, cpu); + + raw_spin_lock_init(&head->lock); + head->first = NULL; + } + return 0; +} + +void pcpu_freelist_destroy(struct pcpu_freelist *s) +{ + free_percpu(s->freelist); +} + +static inline void __pcpu_freelist_push(struct pcpu_freelist_head *head, + struct pcpu_freelist_node *node) +{ + raw_spin_lock(&head->lock); + node->next = head->first; + head->first = node; + raw_spin_unlock(&head->lock); +} + +void pcpu_freelist_push(struct pcpu_freelist *s, + struct pcpu_freelist_node *node) +{ + struct pcpu_freelist_head *head = this_cpu_ptr(s->freelist); + + __pcpu_freelist_push(head, node); +} + +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems) +{ + struct pcpu_freelist_head *head; + unsigned long flags; + int i, cpu, pcpu_entries; + + pcpu_entries = nr_elems / num_possible_cpus() + 1; + i = 0; + + /* disable irq to workaround lockdep false positive + * in bpf usage pcpu_freelist_populate() will never race + * with pcpu_freelist_push() + */ + local_irq_save(flags); + for_each_possible_cpu(cpu) { +again: + head = per_cpu_ptr(s->freelist, cpu); + __pcpu_freelist_push(head, buf); + i++; + buf += elem_size; + if (i == nr_elems) + break; + if (i % pcpu_entries) + goto again; + } + local_irq_restore(flags); +} + +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *s) +{ + struct pcpu_freelist_head *head; + struct pcpu_freelist_node *node; + int orig_cpu, cpu; + + orig_cpu = cpu = raw_smp_processor_id(); + while (1) { + head = per_cpu_ptr(s->freelist, cpu); + raw_spin_lock(&head->lock); + node = head->first; + if (node) { + head->first = node->next; + raw_spin_unlock(&head->lock); + return node; + } + raw_spin_unlock(&head->lock); + cpu = cpumask_next(cpu, cpu_possible_mask); + if (cpu >= nr_cpu_ids) + cpu = 0; + if (cpu == orig_cpu) + return NULL; + } +} diff --git a/kernel/bpf/percpu_freelist.h b/kernel/bpf/percpu_freelist.h new file mode 100644 index 000000000000..3049aae8ea1e --- /dev/null +++ b/kernel/bpf/percpu_freelist.h @@ -0,0 +1,31 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#ifndef __PERCPU_FREELIST_H__ +#define __PERCPU_FREELIST_H__ +#include <linux/spinlock.h> +#include <linux/percpu.h> + +struct pcpu_freelist_head { + struct pcpu_freelist_node *first; + raw_spinlock_t lock; +}; + +struct pcpu_freelist { + struct pcpu_freelist_head __percpu *freelist; +}; + +struct pcpu_freelist_node { + struct pcpu_freelist_node *next; +}; + +void pcpu_freelist_push(struct pcpu_freelist *, struct pcpu_freelist_node *); +struct pcpu_freelist_node *pcpu_freelist_pop(struct pcpu_freelist *); +void pcpu_freelist_populate(struct pcpu_freelist *s, void *buf, u32 elem_size, + u32 nr_elems); +int pcpu_freelist_init(struct pcpu_freelist *); +void pcpu_freelist_destroy(struct pcpu_freelist *s); +#endif diff --git a/kernel/bpf/stackmap.c b/kernel/bpf/stackmap.c new file mode 100644 index 000000000000..499d9e933f8e --- /dev/null +++ b/kernel/bpf/stackmap.c @@ -0,0 +1,290 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/bpf.h> +#include <linux/jhash.h> +#include <linux/filter.h> +#include <linux/vmalloc.h> +#include <linux/stacktrace.h> +#include <linux/perf_event.h> +#include "percpu_freelist.h" + +struct stack_map_bucket { + struct pcpu_freelist_node fnode; + u32 hash; + u32 nr; + u64 ip[]; +}; + +struct bpf_stack_map { + struct bpf_map map; + void *elems; + struct pcpu_freelist freelist; + u32 n_buckets; + struct stack_map_bucket *buckets[]; +}; + +static int prealloc_elems_and_freelist(struct bpf_stack_map *smap) +{ + u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size; + int err; + + smap->elems = vzalloc(elem_size * smap->map.max_entries); + if (!smap->elems) + return -ENOMEM; + + err = pcpu_freelist_init(&smap->freelist); + if (err) + goto free_elems; + + pcpu_freelist_populate(&smap->freelist, smap->elems, elem_size, + smap->map.max_entries); + return 0; + +free_elems: + vfree(smap->elems); + return err; +} + +/* Called from syscall */ +static struct bpf_map *stack_map_alloc(union bpf_attr *attr) +{ + u32 value_size = attr->value_size; + struct bpf_stack_map *smap; + u64 cost, n_buckets; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + if (attr->map_flags) + return ERR_PTR(-EINVAL); + + /* check sanity of attributes */ + if (attr->max_entries == 0 || attr->key_size != 4 || + value_size < 8 || value_size % 8 || + value_size / 8 > PERF_MAX_STACK_DEPTH) + return ERR_PTR(-EINVAL); + + /* hash table size must be power of 2 */ + n_buckets = roundup_pow_of_two(attr->max_entries); + + cost = n_buckets * sizeof(struct stack_map_bucket *) + sizeof(*smap); + if (cost >= U32_MAX - PAGE_SIZE) + return ERR_PTR(-E2BIG); + + smap = kzalloc(cost, GFP_USER | __GFP_NOWARN); + if (!smap) { + smap = vzalloc(cost); + if (!smap) + return ERR_PTR(-ENOMEM); + } + + err = -E2BIG; + cost += n_buckets * (value_size + sizeof(struct stack_map_bucket)); + if (cost >= U32_MAX - PAGE_SIZE) + goto free_smap; + + smap->map.map_type = attr->map_type; + smap->map.key_size = attr->key_size; + smap->map.value_size = value_size; + smap->map.max_entries = attr->max_entries; + smap->n_buckets = n_buckets; + smap->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT; + + err = bpf_map_precharge_memlock(smap->map.pages); + if (err) + goto free_smap; + + err = get_callchain_buffers(); + if (err) + goto free_smap; + + err = prealloc_elems_and_freelist(smap); + if (err) + goto put_buffers; + + return &smap->map; + +put_buffers: + put_callchain_buffers(); +free_smap: + kvfree(smap); + return ERR_PTR(err); +} + +static u64 bpf_get_stackid(u64 r1, u64 r2, u64 flags, u64 r4, u64 r5) +{ + struct pt_regs *regs = (struct pt_regs *) (long) r1; + struct bpf_map *map = (struct bpf_map *) (long) r2; + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct perf_callchain_entry *trace; + struct stack_map_bucket *bucket, *new_bucket, *old_bucket; + u32 max_depth = map->value_size / 8; + /* stack_map_alloc() checks that max_depth <= PERF_MAX_STACK_DEPTH */ + u32 init_nr = PERF_MAX_STACK_DEPTH - max_depth; + u32 skip = flags & BPF_F_SKIP_FIELD_MASK; + u32 hash, id, trace_nr, trace_len; + bool user = flags & BPF_F_USER_STACK; + bool kernel = !user; + u64 *ips; + + if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK | + BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID))) + return -EINVAL; + + trace = get_perf_callchain(regs, init_nr, kernel, user, false, false); + + if (unlikely(!trace)) + /* couldn't fetch the stack trace */ + return -EFAULT; + + /* get_perf_callchain() guarantees that trace->nr >= init_nr + * and trace-nr <= PERF_MAX_STACK_DEPTH, so trace_nr <= max_depth + */ + trace_nr = trace->nr - init_nr; + + if (trace_nr <= skip) + /* skipping more than usable stack trace */ + return -EFAULT; + + trace_nr -= skip; + trace_len = trace_nr * sizeof(u64); + ips = trace->ip + skip + init_nr; + hash = jhash2((u32 *)ips, trace_len / sizeof(u32), 0); + id = hash & (smap->n_buckets - 1); + bucket = READ_ONCE(smap->buckets[id]); + + if (bucket && bucket->hash == hash) { + if (flags & BPF_F_FAST_STACK_CMP) + return id; + if (bucket->nr == trace_nr && + memcmp(bucket->ip, ips, trace_len) == 0) + return id; + } + + /* this call stack is not in the map, try to add it */ + if (bucket && !(flags & BPF_F_REUSE_STACKID)) + return -EEXIST; + + new_bucket = (struct stack_map_bucket *) + pcpu_freelist_pop(&smap->freelist); + if (unlikely(!new_bucket)) + return -ENOMEM; + + memcpy(new_bucket->ip, ips, trace_len); + new_bucket->hash = hash; + new_bucket->nr = trace_nr; + + old_bucket = xchg(&smap->buckets[id], new_bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return id; +} + +const struct bpf_func_proto bpf_get_stackid_proto = { + .func = bpf_get_stackid, + .gpl_only = true, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_CONST_MAP_PTR, + .arg3_type = ARG_ANYTHING, +}; + +/* Called from eBPF program */ +static void *stack_map_lookup_elem(struct bpf_map *map, void *key) +{ + return NULL; +} + +/* Called from syscall */ +int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *bucket, *old_bucket; + u32 id = *(u32 *)key, trace_len; + + if (unlikely(id >= smap->n_buckets)) + return -ENOENT; + + bucket = xchg(&smap->buckets[id], NULL); + if (!bucket) + return -ENOENT; + + trace_len = bucket->nr * sizeof(u64); + memcpy(value, bucket->ip, trace_len); + memset(value + trace_len, 0, map->value_size - trace_len); + + old_bucket = xchg(&smap->buckets[id], bucket); + if (old_bucket) + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; +} + +static int stack_map_get_next_key(struct bpf_map *map, void *key, void *next_key) +{ + return -EINVAL; +} + +static int stack_map_update_elem(struct bpf_map *map, void *key, void *value, + u64 map_flags) +{ + return -EINVAL; +} + +/* Called from syscall or from eBPF program */ +static int stack_map_delete_elem(struct bpf_map *map, void *key) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + struct stack_map_bucket *old_bucket; + u32 id = *(u32 *)key; + + if (unlikely(id >= smap->n_buckets)) + return -E2BIG; + + old_bucket = xchg(&smap->buckets[id], NULL); + if (old_bucket) { + pcpu_freelist_push(&smap->freelist, &old_bucket->fnode); + return 0; + } else { + return -ENOENT; + } +} + +/* Called when map->refcnt goes to zero, either from workqueue or from syscall */ +static void stack_map_free(struct bpf_map *map) +{ + struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map); + + /* wait for bpf programs to complete before freeing stack map */ + synchronize_rcu(); + + vfree(smap->elems); + pcpu_freelist_destroy(&smap->freelist); + kvfree(smap); + put_callchain_buffers(); +} + +static const struct bpf_map_ops stack_map_ops = { + .map_alloc = stack_map_alloc, + .map_free = stack_map_free, + .map_get_next_key = stack_map_get_next_key, + .map_lookup_elem = stack_map_lookup_elem, + .map_update_elem = stack_map_update_elem, + .map_delete_elem = stack_map_delete_elem, +}; + +static struct bpf_map_type_list stack_map_type __read_mostly = { + .ops = &stack_map_ops, + .type = BPF_MAP_TYPE_STACK_TRACE, +}; + +static int __init register_stack_map(void) +{ + bpf_register_map_type(&stack_map_type); + return 0; +} +late_initcall(register_stack_map); diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 0d3313d02a7e..adc5e4bd74f8 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -18,6 +18,8 @@ #include <linux/filter.h> #include <linux/version.h> +DEFINE_PER_CPU(int, bpf_prog_active); + int sysctl_unprivileged_bpf_disabled __read_mostly; static LIST_HEAD(bpf_map_types); @@ -46,6 +48,19 @@ void bpf_register_map_type(struct bpf_map_type_list *tl) list_add(&tl->list_node, &bpf_map_types); } +int bpf_map_precharge_memlock(u32 pages) +{ + struct user_struct *user = get_current_user(); + unsigned long memlock_limit, cur; + + memlock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; + cur = atomic_long_read(&user->locked_vm); + free_uid(user); + if (cur + pages > memlock_limit) + return -EPERM; + return 0; +} + static int bpf_map_charge_memlock(struct bpf_map *map) { struct user_struct *user = get_current_user(); @@ -82,6 +97,14 @@ static void bpf_map_free_deferred(struct work_struct *work) map->ops->map_free(map); } +static void bpf_map_put_uref(struct bpf_map *map) +{ + if (atomic_dec_and_test(&map->usercnt)) { + if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) + bpf_fd_array_map_clear(map); + } +} + /* decrement map refcnt and schedule it for freeing via workqueue * (unrelying map implementation ops->map_free() might sleep) */ @@ -93,22 +116,42 @@ void bpf_map_put(struct bpf_map *map) } } +void bpf_map_put_with_uref(struct bpf_map *map) +{ + bpf_map_put_uref(map); + bpf_map_put(map); +} + static int bpf_map_release(struct inode *inode, struct file *filp) { - struct bpf_map *map = filp->private_data; + bpf_map_put_with_uref(filp->private_data); + return 0; +} - if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY) - /* prog_array stores refcnt-ed bpf_prog pointers - * release them all when user space closes prog_array_fd - */ - bpf_fd_array_map_clear(map); +#ifdef CONFIG_PROC_FS +static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp) +{ + const struct bpf_map *map = filp->private_data; - bpf_map_put(map); - return 0; + seq_printf(m, + "map_type:\t%u\n" + "key_size:\t%u\n" + "value_size:\t%u\n" + "max_entries:\t%u\n" + "map_flags:\t%#x\n", + map->map_type, + map->key_size, + map->value_size, + map->max_entries, + map->map_flags); } +#endif static const struct file_operations bpf_map_fops = { - .release = bpf_map_release, +#ifdef CONFIG_PROC_FS + .show_fdinfo = bpf_map_show_fdinfo, +#endif + .release = bpf_map_release, }; int bpf_map_new_fd(struct bpf_map *map) @@ -125,7 +168,7 @@ int bpf_map_new_fd(struct bpf_map *map) offsetof(union bpf_attr, CMD##_LAST_FIELD) - \ sizeof(attr->CMD##_LAST_FIELD)) != NULL -#define BPF_MAP_CREATE_LAST_FIELD max_entries +#define BPF_MAP_CREATE_LAST_FIELD map_flags /* called via syscall */ static int map_create(union bpf_attr *attr) { @@ -142,6 +185,7 @@ static int map_create(union bpf_attr *attr) return PTR_ERR(map); atomic_set(&map->refcnt, 1); + atomic_set(&map->usercnt, 1); err = bpf_map_charge_memlock(map); if (err) @@ -174,7 +218,14 @@ struct bpf_map *__bpf_map_get(struct fd f) return f.file->private_data; } -struct bpf_map *bpf_map_get(u32 ufd) +void bpf_map_inc(struct bpf_map *map, bool uref) +{ + atomic_inc(&map->refcnt); + if (uref) + atomic_inc(&map->usercnt); +} + +struct bpf_map *bpf_map_get_with_uref(u32 ufd) { struct fd f = fdget(ufd); struct bpf_map *map; @@ -183,7 +234,7 @@ struct bpf_map *bpf_map_get(u32 ufd) if (IS_ERR(map)) return map; - atomic_inc(&map->refcnt); + bpf_map_inc(map, true); fdput(f); return map; @@ -195,6 +246,11 @@ static void __user *u64_to_ptr(__u64 val) return (void __user *) (unsigned long) val; } +int __weak bpf_stackmap_copy(struct bpf_map *map, void *key, void *value) +{ + return -ENOTSUPP; +} + /* last field in 'union bpf_attr' used by this command */ #define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value @@ -205,6 +261,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value, *ptr; + u32 value_size; struct fd f; int err; @@ -225,23 +282,37 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; - rcu_read_lock(); - ptr = map->ops->map_lookup_elem(map, key); - if (ptr) - memcpy(value, ptr, map->value_size); - rcu_read_unlock(); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_copy(map, key, value); + } else if (map->map_type == BPF_MAP_TYPE_STACK_TRACE) { + err = bpf_stackmap_copy(map, key, value); + } else { + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, value_size); + rcu_read_unlock(); + err = ptr ? 0 : -ENOENT; + } - err = -ENOENT; - if (!ptr) + if (err) goto free_value; err = -EFAULT; - if (copy_to_user(uvalue, value, map->value_size) != 0) + if (copy_to_user(uvalue, value, value_size) != 0) goto free_value; err = 0; @@ -264,6 +335,7 @@ static int map_update_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct bpf_map *map; void *key, *value; + u32 value_size; struct fd f; int err; @@ -284,21 +356,37 @@ static int map_update_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH || + map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) + value_size = round_up(map->value_size, 8) * num_possible_cpus(); + else + value_size = map->value_size; + err = -ENOMEM; - value = kmalloc(map->value_size, GFP_USER); + value = kmalloc(value_size, GFP_USER | __GFP_NOWARN); if (!value) goto free_key; err = -EFAULT; - if (copy_from_user(value, uvalue, map->value_size) != 0) + if (copy_from_user(value, uvalue, value_size) != 0) goto free_value; - /* eBPF program that use maps are running under rcu_read_lock(), - * therefore all map accessors rely on this fact, so do the same here + /* must increment bpf_prog_active to avoid kprobe+bpf triggering from + * inside bpf map update or delete otherwise deadlocks are possible */ - rcu_read_lock(); - err = map->ops->map_update_elem(map, key, value, attr->flags); - rcu_read_unlock(); + preempt_disable(); + __this_cpu_inc(bpf_prog_active); + if (map->map_type == BPF_MAP_TYPE_PERCPU_HASH) { + err = bpf_percpu_hash_update(map, key, value, attr->flags); + } else if (map->map_type == BPF_MAP_TYPE_PERCPU_ARRAY) { + err = bpf_percpu_array_update(map, key, value, attr->flags); + } else { + rcu_read_lock(); + err = map->ops->map_update_elem(map, key, value, attr->flags); + rcu_read_unlock(); + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_value: kfree(value); @@ -337,9 +425,13 @@ static int map_delete_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; + preempt_disable(); + __this_cpu_inc(bpf_prog_active); rcu_read_lock(); err = map->ops->map_delete_elem(map, key); rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); free_key: kfree(key); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index c6073056badf..2e08f8e9b771 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -246,6 +246,7 @@ static const struct { {BPF_MAP_TYPE_PROG_ARRAY, BPF_FUNC_tail_call}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_read}, {BPF_MAP_TYPE_PERF_EVENT_ARRAY, BPF_FUNC_perf_event_output}, + {BPF_MAP_TYPE_STACK_TRACE, BPF_FUNC_get_stackid}, }; static void print_verifier_state(struct verifier_env *env) @@ -778,15 +779,24 @@ static int check_xadd(struct verifier_env *env, struct bpf_insn *insn) * bytes from that pointer, make sure that it's within stack boundary * and all elements of stack are initialized */ -static int check_stack_boundary(struct verifier_env *env, - int regno, int access_size) +static int check_stack_boundary(struct verifier_env *env, int regno, + int access_size, bool zero_size_allowed) { struct verifier_state *state = &env->cur_state; struct reg_state *regs = state->regs; int off, i; - if (regs[regno].type != PTR_TO_STACK) + if (regs[regno].type != PTR_TO_STACK) { + if (zero_size_allowed && access_size == 0 && + regs[regno].type == CONST_IMM && + regs[regno].imm == 0) + return 0; + + verbose("R%d type=%s expected=%s\n", regno, + reg_type_str[regs[regno].type], + reg_type_str[PTR_TO_STACK]); return -EACCES; + } off = regs[regno].imm; if (off >= 0 || off < -MAX_BPF_STACK || off + access_size > 0 || @@ -829,15 +839,24 @@ static int check_func_arg(struct verifier_env *env, u32 regno, return 0; } - if (arg_type == ARG_PTR_TO_STACK || arg_type == ARG_PTR_TO_MAP_KEY || + if (arg_type == ARG_PTR_TO_MAP_KEY || arg_type == ARG_PTR_TO_MAP_VALUE) { expected_type = PTR_TO_STACK; - } else if (arg_type == ARG_CONST_STACK_SIZE) { + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { expected_type = CONST_IMM; } else if (arg_type == ARG_CONST_MAP_PTR) { expected_type = CONST_PTR_TO_MAP; } else if (arg_type == ARG_PTR_TO_CTX) { expected_type = PTR_TO_CTX; + } else if (arg_type == ARG_PTR_TO_STACK) { + expected_type = PTR_TO_STACK; + /* One exception here. In case function allows for NULL to be + * passed in as argument, it's a CONST_IMM type. Final test + * happens during stack boundary checking. + */ + if (reg->type == CONST_IMM && reg->imm == 0) + expected_type = CONST_IMM; } else { verbose("unsupported arg_type %d\n", arg_type); return -EFAULT; @@ -867,8 +886,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->key\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->key_size); - + err = check_stack_boundary(env, regno, (*mapp)->key_size, + false); } else if (arg_type == ARG_PTR_TO_MAP_VALUE) { /* bpf_map_xxx(..., map_ptr, ..., value) call: * check [value, value + map->value_size) validity @@ -878,9 +897,12 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("invalid map_ptr to access map->value\n"); return -EACCES; } - err = check_stack_boundary(env, regno, (*mapp)->value_size); + err = check_stack_boundary(env, regno, (*mapp)->value_size, + false); + } else if (arg_type == ARG_CONST_STACK_SIZE || + arg_type == ARG_CONST_STACK_SIZE_OR_ZERO) { + bool zero_size_allowed = (arg_type == ARG_CONST_STACK_SIZE_OR_ZERO); - } else if (arg_type == ARG_CONST_STACK_SIZE) { /* bpf_xxx(..., buf, len) call will access 'len' bytes * from stack pointer 'buf'. Check it * note: regno == len, regno - 1 == buf @@ -890,7 +912,8 @@ static int check_func_arg(struct verifier_env *env, u32 regno, verbose("ARG_CONST_STACK_SIZE cannot be first argument\n"); return -EACCES; } - err = check_stack_boundary(env, regno - 1, reg->imm); + err = check_stack_boundary(env, regno - 1, reg->imm, + zero_size_allowed); } return err; @@ -911,8 +934,11 @@ static int check_map_func_compatibility(struct bpf_map *map, int func_id) * don't allow any other map type to be passed into * the special func; */ - if (bool_func && bool_map != bool_func) + if (bool_func && bool_map != bool_func) { + verbose("cannot pass map_type %d into func %d\n", + map->map_type, func_id); return -EINVAL; + } } return 0; @@ -1121,6 +1147,16 @@ static int check_alu_op(struct verifier_env *env, struct bpf_insn *insn) return -EINVAL; } + if ((opcode == BPF_LSH || opcode == BPF_RSH || + opcode == BPF_ARSH) && BPF_SRC(insn->code) == BPF_K) { + int size = BPF_CLASS(insn->code) == BPF_ALU64 ? 64 : 32; + + if (insn->imm < 0 || insn->imm >= size) { + verbose("invalid shift %d\n", insn->imm); + return -EINVAL; + } + } + /* pattern match 'bpf_add Rx, imm' instruction */ if (opcode == BPF_ADD && BPF_CLASS(insn->code) == BPF_ALU64 && regs[insn->dst_reg].type == FRAME_PTR && @@ -2021,8 +2057,7 @@ static int replace_map_fd_with_map_ptr(struct verifier_env *env) * will be used by the valid program until it's unloaded * and all maps are released in free_bpf_prog_info() */ - atomic_inc(&map->refcnt); - + bpf_map_inc(map, false); fdput(f); next_insn: insn++; @@ -2073,7 +2108,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index b9d0cce3f9ce..671dc05c0b0f 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -57,8 +57,12 @@ #include <linux/vmalloc.h> /* TODO: replace with more sophisticated array */ #include <linux/kthread.h> #include <linux/delay.h> - #include <linux/atomic.h> +#include <linux/cpuset.h> +#include <linux/proc_ns.h> +#include <linux/nsproxy.h> +#include <linux/proc_ns.h> +#include <net/sock.h> /* * pidlists linger the following amount before being destroyed. The goal @@ -98,6 +102,12 @@ static DEFINE_SPINLOCK(css_set_lock); static DEFINE_SPINLOCK(cgroup_idr_lock); /* + * Protects cgroup_file->kn for !self csses. It synchronizes notifications + * against file removal/re-creation across css hiding. + */ +static DEFINE_SPINLOCK(cgroup_file_kn_lock); + +/* * Protects cgroup_subsys->release_agent_path. Modifying it also requires * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock. */ @@ -171,10 +181,16 @@ EXPORT_SYMBOL_GPL(cgrp_dfl_root); * The default hierarchy always exists but is hidden until mounted for the * first time. This is for backward compatibility. */ -static bool cgrp_dfl_root_visible; +static bool cgrp_dfl_visible; + +/* Controllers blocked by the commandline in v1 */ +static u16 cgroup_no_v1_mask; /* some controllers are not supported in the default hierarchy */ -static unsigned long cgrp_dfl_root_inhibit_ss_mask; +static u16 cgrp_dfl_inhibit_ss_mask; + +/* some controllers are implicitly enabled on the default hierarchy */ +static unsigned long cgrp_dfl_implicit_ss_mask; /* The list of hierarchy roots */ @@ -198,22 +214,34 @@ static u64 css_serial_nr_next = 1; * fork/exit handlers to call. This avoids us having to do extra work in the * fork/exit path to check which subsystems have fork/exit callbacks. */ -static unsigned long have_fork_callback __read_mostly; -static unsigned long have_exit_callback __read_mostly; -static unsigned long have_free_callback __read_mostly; +static u16 have_fork_callback __read_mostly; +static u16 have_exit_callback __read_mostly; +static u16 have_free_callback __read_mostly; + +/* cgroup namespace for init task */ +struct cgroup_namespace init_cgroup_ns = { + .count = { .counter = 2, }, + .user_ns = &init_user_ns, + .ns.ops = &cgroupns_operations, + .ns.inum = PROC_CGROUP_INIT_INO, + .root_cset = &init_css_set, +}; /* Ditto for the can_fork callback. */ -static unsigned long have_canfork_callback __read_mostly; +static u16 have_canfork_callback __read_mostly; +static struct file_system_type cgroup2_fs_type; static struct cftype cgroup_dfl_base_files[]; static struct cftype cgroup_legacy_base_files[]; -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask); +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask); +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp); +static int cgroup_apply_control(struct cgroup *cgrp); +static void cgroup_finalize_control(struct cgroup *cgrp, int ret); static void css_task_iter_advance(struct css_task_iter *it); static int cgroup_destroy_locked(struct cgroup *cgrp); -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible); +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss); static void css_release(struct percpu_ref *ref); static void kill_css(struct cgroup_subsys_state *css); static int cgroup_addrm_files(struct cgroup_subsys_state *css, @@ -230,9 +258,17 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, */ static bool cgroup_ssid_enabled(int ssid) { + if (CGROUP_SUBSYS_COUNT == 0) + return false; + return static_key_enabled(cgroup_subsys_enabled_key[ssid]); } +static bool cgroup_ssid_no_v1(int ssid) +{ + return cgroup_no_v1_mask & (1 << ssid); +} + /** * cgroup_on_dfl - test whether a cgroup is on the default hierarchy * @cgrp: the cgroup of interest @@ -299,7 +335,7 @@ static int cgroup_idr_alloc(struct idr *idr, void *ptr, int start, int end, idr_preload(gfp_mask); spin_lock_bh(&cgroup_idr_lock); - ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_WAIT); + ret = idr_alloc(idr, ptr, start, end, gfp_mask & ~__GFP_DIRECT_RECLAIM); spin_unlock_bh(&cgroup_idr_lock); idr_preload_end(); return ret; @@ -331,6 +367,32 @@ static struct cgroup *cgroup_parent(struct cgroup *cgrp) return NULL; } +/* subsystems visibly enabled on a cgroup */ +static u16 cgroup_control(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + u16 root_ss_mask = cgrp->root->subsys_mask; + + if (parent) + return parent->subtree_control; + + if (cgroup_on_dfl(cgrp)) + root_ss_mask &= ~(cgrp_dfl_inhibit_ss_mask | + cgrp_dfl_implicit_ss_mask); + return root_ss_mask; +} + +/* subsystems enabled on a cgroup */ +static u16 cgroup_ss_mask(struct cgroup *cgrp) +{ + struct cgroup *parent = cgroup_parent(cgrp); + + if (parent) + return parent->subtree_ss_mask; + + return cgrp->root->subsys_mask; +} + /** * cgroup_css - obtain a cgroup's css for the specified subsystem * @cgrp: the cgroup of interest @@ -370,16 +432,15 @@ static struct cgroup_subsys_state *cgroup_e_css(struct cgroup *cgrp, if (!ss) return &cgrp->self; - if (!(cgrp->root->subsys_mask & (1 << ss->id))) - return NULL; - /* * This function is used while updating css associations and thus - * can't test the csses directly. Use ->child_subsys_mask. + * can't test the csses directly. Test ss_mask. */ - while (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->child_subsys_mask & (1 << ss->id))) + while (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) { cgrp = cgroup_parent(cgrp); + if (!cgrp) + return NULL; + } return cgroup_css(cgrp, ss); } @@ -434,11 +495,6 @@ static bool cgroup_tryget(struct cgroup *cgrp) return css_tryget(&cgrp->self); } -static void cgroup_put(struct cgroup *cgrp) -{ - css_put(&cgrp->self); -} - struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) { struct cgroup *cgrp = of->kn->parent->priv; @@ -459,25 +515,6 @@ struct cgroup_subsys_state *of_css(struct kernfs_open_file *of) } EXPORT_SYMBOL_GPL(of_css); -/** - * cgroup_is_descendant - test ancestry - * @cgrp: the cgroup to be tested - * @ancestor: possible ancestor of @cgrp - * - * Test whether @cgrp is a descendant of @ancestor. It also returns %true - * if @cgrp == @ancestor. This function is safe to call as long as @cgrp - * and @ancestor are accessible. - */ -bool cgroup_is_descendant(struct cgroup *cgrp, struct cgroup *ancestor) -{ - while (cgrp) { - if (cgrp == ancestor) - return true; - cgrp = cgroup_parent(cgrp); - } - return false; -} - static int notify_on_release(const struct cgroup *cgrp) { return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -522,22 +559,28 @@ static int notify_on_release(const struct cgroup *cgrp) (((ss) = cgroup_subsys[ssid]) || true); (ssid)++) /** - * for_each_subsys_which - filter for_each_subsys with a bitmask + * do_each_subsys_mask - filter for_each_subsys with a bitmask * @ss: the iteration cursor * @ssid: the index of @ss, CGROUP_SUBSYS_COUNT after reaching the end - * @ss_maskp: a pointer to the bitmask + * @ss_mask: the bitmask * * The block will only run for cases where the ssid-th bit (1 << ssid) of - * mask is set to 1. + * @ss_mask is set. */ -#define for_each_subsys_which(ss, ssid, ss_maskp) \ - if (!CGROUP_SUBSYS_COUNT) /* to avoid spurious gcc warning */ \ +#define do_each_subsys_mask(ss, ssid, ss_mask) do { \ + unsigned long __ss_mask = (ss_mask); \ + if (!CGROUP_SUBSYS_COUNT) { /* to avoid spurious gcc warning */ \ (ssid) = 0; \ - else \ - for_each_set_bit(ssid, ss_maskp, CGROUP_SUBSYS_COUNT) \ - if (((ss) = cgroup_subsys[ssid]) && false) \ - break; \ - else + break; \ + } \ + for_each_set_bit(ssid, &__ss_mask, CGROUP_SUBSYS_COUNT) { \ + (ss) = cgroup_subsys[ssid]; \ + { + +#define while_each_subsys_mask() \ + } \ + } \ +} while (false) /* iterate across the hierarchies */ #define for_each_root(root) \ @@ -551,6 +594,24 @@ static int notify_on_release(const struct cgroup *cgrp) ; \ else +/* walk live descendants in preorder */ +#define cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) \ + css_for_each_descendant_pre((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + +/* walk live descendants in postorder */ +#define cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) \ + css_for_each_descendant_post((d_css), cgroup_css((cgrp), NULL)) \ + if (({ lockdep_assert_held(&cgroup_mutex); \ + (dsct) = (d_css)->cgroup; \ + cgroup_is_dead(dsct); })) \ + ; \ + else + static void cgroup_release_agent(struct work_struct *work); static void check_for_release(struct cgroup *cgrp); @@ -681,6 +742,9 @@ static void css_set_move_task(struct task_struct *task, { lockdep_assert_held(&css_set_lock); + if (to_cset && !css_set_populated(to_cset)) + css_set_update_populated(to_cset, true); + if (from_cset) { struct css_task_iter *it, *pos; @@ -714,8 +778,6 @@ static void css_set_move_task(struct task_struct *task, */ WARN_ON_ONCE(task->flags & PF_EXITING); - if (!css_set_populated(to_cset)) - css_set_update_populated(to_cset, true); rcu_assign_pointer(task->cgroups, to_cset); list_add_tail(&task->cg_list, use_mg_tasks ? &to_cset->mg_tasks : &to_cset->tasks); @@ -754,9 +816,11 @@ static void put_css_set_locked(struct css_set *cset) if (!atomic_dec_and_test(&cset->refcount)) return; - /* This css_set is dead. unlink it and release cgroup refcounts */ - for_each_subsys(ss, ssid) + /* This css_set is dead. unlink it and release cgroup and css refs */ + for_each_subsys(ss, ssid) { list_del(&cset->e_cset_node[ssid]); + css_put(cset->subsys[ssid]); + } hash_del(&cset->hlist); css_set_count--; @@ -1056,9 +1120,13 @@ static struct css_set *find_css_set(struct css_set *old_cset, key = css_set_hash(cset->subsys); hash_add(css_set_table, &cset->hlist, key); - for_each_subsys(ss, ssid) + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cset->subsys[ssid]; + list_add_tail(&cset->e_cset_node[ssid], - &cset->subsys[ssid]->cgroup->e_csets[ssid]); + &css->cgroup->e_csets[ssid]); + css_get(css); + } spin_unlock_bh(&css_set_lock); @@ -1112,13 +1180,13 @@ static void cgroup_destroy_root(struct cgroup_root *root) struct cgroup *cgrp = &root->cgrp; struct cgrp_cset_link *link, *tmp_link; - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); BUG_ON(atomic_read(&root->nr_cgrps)); BUG_ON(!list_empty(&cgrp->self.children)); /* Rebind all subsystems back to the default hierarchy */ - rebind_subsystems(&cgrp_dfl_root, root->subsys_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, root->subsys_mask)); /* * Release all the links from cset_links to this hierarchy's @@ -1258,46 +1326,40 @@ static umode_t cgroup_file_mode(const struct cftype *cft) } /** - * cgroup_calc_child_subsys_mask - calculate child_subsys_mask - * @cgrp: the target cgroup + * cgroup_calc_subtree_ss_mask - calculate subtree_ss_mask * @subtree_control: the new subtree_control mask to consider + * @this_ss_mask: available subsystems * * On the default hierarchy, a subsystem may request other subsystems to be * enabled together through its ->depends_on mask. In such cases, more * subsystems than specified in "cgroup.subtree_control" may be enabled. * * This function calculates which subsystems need to be enabled if - * @subtree_control is to be applied to @cgrp. The returned mask is always - * a superset of @subtree_control and follows the usual hierarchy rules. + * @subtree_control is to be applied while restricted to @this_ss_mask. */ -static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, - unsigned long subtree_control) +static u16 cgroup_calc_subtree_ss_mask(u16 subtree_control, u16 this_ss_mask) { - struct cgroup *parent = cgroup_parent(cgrp); - unsigned long cur_ss_mask = subtree_control; + u16 cur_ss_mask = subtree_control; struct cgroup_subsys *ss; int ssid; lockdep_assert_held(&cgroup_mutex); - if (!cgroup_on_dfl(cgrp)) - return cur_ss_mask; + cur_ss_mask |= cgrp_dfl_implicit_ss_mask; while (true) { - unsigned long new_ss_mask = cur_ss_mask; + u16 new_ss_mask = cur_ss_mask; - for_each_subsys_which(ss, ssid, &cur_ss_mask) + do_each_subsys_mask(ss, ssid, cur_ss_mask) { new_ss_mask |= ss->depends_on; + } while_each_subsys_mask(); /* * Mask out subsystems which aren't available. This can * happen only if some depended-upon subsystems were bound * to non-default hierarchies. */ - if (parent) - new_ss_mask &= parent->child_subsys_mask; - else - new_ss_mask &= cgrp->root->subsys_mask; + new_ss_mask &= this_ss_mask; if (new_ss_mask == cur_ss_mask) break; @@ -1308,19 +1370,6 @@ static unsigned long cgroup_calc_child_subsys_mask(struct cgroup *cgrp, } /** - * cgroup_refresh_child_subsys_mask - update child_subsys_mask - * @cgrp: the target cgroup - * - * Update @cgrp->child_subsys_mask according to the current - * @cgrp->subtree_control using cgroup_calc_child_subsys_mask(). - */ -static void cgroup_refresh_child_subsys_mask(struct cgroup *cgrp) -{ - cgrp->child_subsys_mask = - cgroup_calc_child_subsys_mask(cgrp, cgrp->subtree_control); -} - -/** * cgroup_kn_unlock - unlocking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced * @@ -1348,19 +1397,22 @@ static void cgroup_kn_unlock(struct kernfs_node *kn) /** * cgroup_kn_lock_live - locking helper for cgroup kernfs methods * @kn: the kernfs_node being serviced + * @drain_offline: perform offline draining on the cgroup * * This helper is to be used by a cgroup kernfs method currently servicing * @kn. It breaks the active protection, performs cgroup locking and * verifies that the associated cgroup is alive. Returns the cgroup if * alive; otherwise, %NULL. A successful return should be undone by a - * matching cgroup_kn_unlock() invocation. + * matching cgroup_kn_unlock() invocation. If @drain_offline is %true, the + * cgroup is drained of offlining csses before return. * * Any cgroup kernfs method implementation which requires locking the * associated cgroup should use this helper. It avoids nesting cgroup * locking under kernfs active protection and allows all kernfs operations * including self-removal. */ -static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) +static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn, + bool drain_offline) { struct cgroup *cgrp; @@ -1379,7 +1431,10 @@ static struct cgroup *cgroup_kn_lock_live(struct kernfs_node *kn) return NULL; kernfs_break_active_protection(kn); - mutex_lock(&cgroup_mutex); + if (drain_offline) + cgroup_lock_and_drain_offline(cgrp); + else + mutex_lock(&cgroup_mutex); if (!cgroup_is_dead(cgrp)) return cgrp; @@ -1393,20 +1448,33 @@ static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) char name[CGROUP_FILE_NAME_MAX]; lockdep_assert_held(&cgroup_mutex); + + if (cft->file_offset) { + struct cgroup_subsys_state *css = cgroup_css(cgrp, cft->ss); + struct cgroup_file *cfile = (void *)css + cft->file_offset; + + spin_lock_irq(&cgroup_file_kn_lock); + cfile->kn = NULL; + spin_unlock_irq(&cgroup_file_kn_lock); + } + kernfs_remove_by_name(cgrp->kn, cgroup_file_name(cgrp, cft, name)); } /** * css_clear_dir - remove subsys files in a cgroup directory * @css: taget css - * @cgrp_override: specify if target cgroup is different from css->cgroup */ -static void css_clear_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static void css_clear_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts; + if (!(css->flags & CSS_VISIBLE)) + return; + + css->flags &= ~CSS_VISIBLE; + list_for_each_entry(cfts, &css->ss->cfts, node) cgroup_addrm_files(css, cgrp, cfts, false); } @@ -1414,17 +1482,18 @@ static void css_clear_dir(struct cgroup_subsys_state *css, /** * css_populate_dir - create subsys files in a cgroup directory * @css: target css - * @cgrp_overried: specify if target cgroup is different from css->cgroup * * On failure, no file is added. */ -static int css_populate_dir(struct cgroup_subsys_state *css, - struct cgroup *cgrp_override) +static int css_populate_dir(struct cgroup_subsys_state *css) { - struct cgroup *cgrp = cgrp_override ?: css->cgroup; + struct cgroup *cgrp = css->cgroup; struct cftype *cfts, *failed_cfts; int ret; + if ((css->flags & CSS_VISIBLE) || !cgrp->kn) + return 0; + if (!css->ss) { if (cgroup_on_dfl(cgrp)) cfts = cgroup_dfl_base_files; @@ -1441,6 +1510,9 @@ static int css_populate_dir(struct cgroup_subsys_state *css, goto err; } } + + css->flags |= CSS_VISIBLE; + return 0; err: list_for_each_entry(cfts, &css->ss->cfts, node) { @@ -1451,67 +1523,30 @@ err: return ret; } -static int rebind_subsystems(struct cgroup_root *dst_root, - unsigned long ss_mask) +static int rebind_subsystems(struct cgroup_root *dst_root, u16 ss_mask) { struct cgroup *dcgrp = &dst_root->cgrp; struct cgroup_subsys *ss; - unsigned long tmp_ss_mask; int ssid, i, ret; lockdep_assert_held(&cgroup_mutex); - for_each_subsys_which(ss, ssid, &ss_mask) { - /* if @ss has non-root csses attached to it, can't move */ - if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss))) + do_each_subsys_mask(ss, ssid, ss_mask) { + /* + * If @ss has non-root csses attached to it, can't move. + * If @ss is an implicit controller, it is exempt from this + * rule and can be stolen. + */ + if (css_next_child(NULL, cgroup_css(&ss->root->cgrp, ss)) && + !ss->implicit_on_dfl) return -EBUSY; /* can't move between two non-dummy roots either */ if (ss->root != &cgrp_dfl_root && dst_root != &cgrp_dfl_root) return -EBUSY; - } - - /* skip creating root files on dfl_root for inhibited subsystems */ - tmp_ss_mask = ss_mask; - if (dst_root == &cgrp_dfl_root) - tmp_ss_mask &= ~cgrp_dfl_root_inhibit_ss_mask; + } while_each_subsys_mask(); - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { - struct cgroup *scgrp = &ss->root->cgrp; - int tssid; - - ret = css_populate_dir(cgroup_css(scgrp, ss), dcgrp); - if (!ret) - continue; - - /* - * Rebinding back to the default root is not allowed to - * fail. Using both default and non-default roots should - * be rare. Moving subsystems back and forth even more so. - * Just warn about it and continue. - */ - if (dst_root == &cgrp_dfl_root) { - if (cgrp_dfl_root_visible) { - pr_warn("failed to create files (%d) while rebinding 0x%lx to default root\n", - ret, ss_mask); - pr_warn("you may retry by moving them to a different hierarchy and unbinding\n"); - } - continue; - } - - for_each_subsys_which(ss, tssid, &tmp_ss_mask) { - if (tssid == ssid) - break; - css_clear_dir(cgroup_css(scgrp, ss), dcgrp); - } - return ret; - } - - /* - * Nothing can fail from this point on. Remove files for the - * removed subsystems and rebind each subsystem. - */ - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { struct cgroup_root *src_root = ss->root; struct cgroup *scgrp = &src_root->cgrp; struct cgroup_subsys_state *css = cgroup_css(scgrp, ss); @@ -1519,8 +1554,12 @@ static int rebind_subsystems(struct cgroup_root *dst_root, WARN_ON(!css || cgroup_css(dcgrp, ss)); - css_clear_dir(css, NULL); + /* disable from the source */ + src_root->subsys_mask &= ~(1 << ssid); + WARN_ON(cgroup_apply_control(scgrp)); + cgroup_finalize_control(scgrp, 0); + /* rebind */ RCU_INIT_POINTER(scgrp->subsys[ssid], NULL); rcu_assign_pointer(dcgrp->subsys[ssid], css); ss->root = dst_root; @@ -1532,23 +1571,23 @@ static int rebind_subsystems(struct cgroup_root *dst_root, &dcgrp->e_csets[ss->id]); spin_unlock_bh(&css_set_lock); - src_root->subsys_mask &= ~(1 << ssid); - scgrp->subtree_control &= ~(1 << ssid); - cgroup_refresh_child_subsys_mask(scgrp); - /* default hierarchy doesn't enable controllers by default */ dst_root->subsys_mask |= 1 << ssid; if (dst_root == &cgrp_dfl_root) { static_branch_enable(cgroup_subsys_on_dfl_key[ssid]); } else { dcgrp->subtree_control |= 1 << ssid; - cgroup_refresh_child_subsys_mask(dcgrp); static_branch_disable(cgroup_subsys_on_dfl_key[ssid]); } + ret = cgroup_apply_control(dcgrp); + if (ret) + pr_warn("partial failure to rebind %s controller (err=%d)\n", + ss->name, ret); + if (ss->bind) ss->bind(css); - } + } while_each_subsys_mask(); kernfs_activate(dcgrp->kn); return 0; @@ -1584,7 +1623,7 @@ static int cgroup_show_options(struct seq_file *seq, } struct cgroup_sb_opts { - unsigned long subsys_mask; + u16 subsys_mask; unsigned int flags; char *release_agent; bool cpuset_clone_children; @@ -1597,13 +1636,13 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) { char *token, *o = data; bool all_ss = false, one_ss = false; - unsigned long mask = -1UL; + u16 mask = U16_MAX; struct cgroup_subsys *ss; int nr_opts = 0; int i; #ifdef CONFIG_CPUSETS - mask = ~(1U << cpuset_cgrp_id); + mask = ~((u16)1 << cpuset_cgrp_id); #endif memset(opts, 0, sizeof(*opts)); @@ -1625,10 +1664,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) all_ss = true; continue; } - if (!strcmp(token, "__DEVEL__sane_behavior")) { - opts->flags |= CGRP_ROOT_SANE_BEHAVIOR; - continue; - } if (!strcmp(token, "noprefix")) { opts->flags |= CGRP_ROOT_NOPREFIX; continue; @@ -1682,6 +1717,8 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) continue; if (!cgroup_ssid_enabled(i)) continue; + if (cgroup_ssid_no_v1(i)) + continue; /* Mutually exclusive option 'all' + subsystem name */ if (all_ss) @@ -1695,15 +1732,6 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) return -ENOENT; } - if (opts->flags & CGRP_ROOT_SANE_BEHAVIOR) { - pr_warn("sane_behavior: this is still under development and its behaviors will change, proceed at your own risk\n"); - if (nr_opts != 1) { - pr_err("sane_behavior: no other mount options allowed\n"); - return -EINVAL; - } - return 0; - } - /* * If the 'all' option was specified select all the subsystems, * otherwise if 'none', 'name=' and a subsystem name options were @@ -1711,7 +1739,7 @@ static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) */ if (all_ss || (!one_ss && !opts->none && !opts->name)) for_each_subsys(ss, i) - if (cgroup_ssid_enabled(i)) + if (cgroup_ssid_enabled(i) && !cgroup_ssid_no_v1(i)) opts->subsys_mask |= (1 << i); /* @@ -1741,14 +1769,14 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) int ret = 0; struct cgroup_root *root = cgroup_root_from_kf(kf_root); struct cgroup_sb_opts opts; - unsigned long added_mask, removed_mask; + u16 added_mask, removed_mask; if (root == &cgrp_dfl_root) { pr_err("remount is not allowed\n"); return -EINVAL; } - mutex_lock(&cgroup_mutex); + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* See what subsystems are wanted */ ret = parse_cgroupfs_options(data, &opts); @@ -1781,7 +1809,7 @@ static int cgroup_remount(struct kernfs_root *kf_root, int *flags, char *data) if (ret) goto out_unlock; - rebind_subsystems(&cgrp_dfl_root, removed_mask); + WARN_ON(rebind_subsystems(&cgrp_dfl_root, removed_mask)); if (opts.release_agent) { spin_lock(&release_agent_path_lock); @@ -1856,7 +1884,6 @@ static void init_cgroup_housekeeping(struct cgroup *cgrp) INIT_LIST_HEAD(&cgrp->self.sibling); INIT_LIST_HEAD(&cgrp->self.children); - INIT_LIST_HEAD(&cgrp->self.files); INIT_LIST_HEAD(&cgrp->cset_links); INIT_LIST_HEAD(&cgrp->pidlists); mutex_init(&cgrp->pidlist_mutex); @@ -1890,7 +1917,7 @@ static void init_cgroup_root(struct cgroup_root *root, set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->cgrp.flags); } -static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) +static int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask) { LIST_HEAD(tmp_links); struct cgroup *root_cgrp = &root->cgrp; @@ -1903,6 +1930,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) if (ret < 0) goto out; root_cgrp->id = ret; + root_cgrp->ancestor_ids[0] = ret; ret = percpu_ref_init(&root_cgrp->self.refcnt, css_release, 0, GFP_KERNEL); @@ -1912,10 +1940,11 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) /* * We're accessing css_set_count without locking css_set_lock here, * but that's OK - it can only be increased by someone holding - * cgroup_lock, and that's us. The worst that can happen is that we - * have some link structures left over + * cgroup_lock, and that's us. Later rebinding may disable + * controllers on the default hierarchy and thus create new csets, + * which can't be more than the existing ones. Allocate 2x. */ - ret = allocate_cgrp_cset_links(css_set_count, &tmp_links); + ret = allocate_cgrp_cset_links(2 * css_set_count, &tmp_links); if (ret) goto cancel_ref; @@ -1932,7 +1961,7 @@ static int cgroup_setup_root(struct cgroup_root *root, unsigned long ss_mask) } root_cgrp->kn = root->kf_root->kn; - ret = css_populate_dir(&root_cgrp->self, NULL); + ret = css_populate_dir(&root_cgrp->self); if (ret) goto destroy_root; @@ -1983,7 +2012,9 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + bool is_v2 = fs_type == &cgroup2_fs_type; struct super_block *pinned_sb = NULL; + struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup_subsys *ss; struct cgroup_root *root; struct cgroup_sb_opts opts; @@ -1992,6 +2023,14 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, int i; bool new_sb; + get_cgroup_ns(ns); + + /* Check if the caller has permission to mount. */ + if (!ns_capable(ns->user_ns, CAP_SYS_ADMIN)) { + put_cgroup_ns(ns); + return ERR_PTR(-EPERM); + } + /* * The first time anyone tries to mount a cgroup, enable the list * linking each css_set to its tasks and fix up all existing tasks. @@ -1999,22 +2038,25 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, if (!use_task_css_set_links) cgroup_enable_task_cg_lists(); - mutex_lock(&cgroup_mutex); + if (is_v2) { + if (data) { + pr_err("cgroup2: unknown option \"%s\"\n", (char *)data); + put_cgroup_ns(ns); + return ERR_PTR(-EINVAL); + } + cgrp_dfl_visible = true; + root = &cgrp_dfl_root; + cgroup_get(&root->cgrp); + goto out_mount; + } + + cgroup_lock_and_drain_offline(&cgrp_dfl_root.cgrp); /* First find the desired set of subsystems */ ret = parse_cgroupfs_options(data, &opts); if (ret) goto out_unlock; - /* look for a matching existing root */ - if (opts.flags & CGRP_ROOT_SANE_BEHAVIOR) { - cgrp_dfl_root_visible = true; - root = &cgrp_dfl_root; - cgroup_get(&root->cgrp); - ret = 0; - goto out_unlock; - } - /* * Destruction of cgroup root is asynchronous, so subsystems may * still be dying after the previous unmount. Let's drain the @@ -2105,6 +2147,16 @@ static struct dentry *cgroup_mount(struct file_system_type *fs_type, goto out_unlock; } + /* + * We know this subsystem has not yet been bound. Users in a non-init + * user namespace may only mount hierarchies with no bound subsystems, + * i.e. 'none,name=user1' + */ + if (!opts.none && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out_unlock; + } + root = kzalloc(sizeof(*root), GFP_KERNEL); if (!root) { ret = -ENOMEM; @@ -2123,11 +2175,37 @@ out_free: kfree(opts.release_agent); kfree(opts.name); - if (ret) + if (ret) { + put_cgroup_ns(ns); return ERR_PTR(ret); - + } +out_mount: dentry = kernfs_mount(fs_type, flags, root->kf_root, - CGROUP_SUPER_MAGIC, &new_sb); + is_v2 ? CGROUP2_SUPER_MAGIC : CGROUP_SUPER_MAGIC, + &new_sb); + + /* + * In non-init cgroup namespace, instead of root cgroup's + * dentry, we return the dentry corresponding to the + * cgroupns->root_cgrp. + */ + if (!IS_ERR(dentry) && ns != &init_cgroup_ns) { + struct dentry *nsdentry; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cgrp = cset_cgroup_from_root(ns->root_cset, root); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + nsdentry = kernfs_node_dentry(cgrp->kn, dentry->d_sb); + dput(dentry); + dentry = nsdentry; + } + if (IS_ERR(dentry) || !new_sb) cgroup_put(&root->cgrp); @@ -2140,6 +2218,7 @@ out_free: deactivate_super(pinned_sb); } + put_cgroup_ns(ns); return dentry; } @@ -2168,8 +2247,45 @@ static struct file_system_type cgroup_fs_type = { .name = "cgroup", .mount = cgroup_mount, .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, +}; + +static struct file_system_type cgroup2_fs_type = { + .name = "cgroup2", + .mount = cgroup_mount, + .kill_sb = cgroup_kill_sb, + .fs_flags = FS_USERNS_MOUNT, }; +static char *cgroup_path_ns_locked(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + struct cgroup *root = cset_cgroup_from_root(ns->root_cset, cgrp->root); + int ret; + + ret = kernfs_path_from_node(cgrp->kn, root->kn, buf, buflen); + if (ret < 0 || ret >= buflen) + return NULL; + return buf; +} + +char *cgroup_path_ns(struct cgroup *cgrp, char *buf, size_t buflen, + struct cgroup_namespace *ns) +{ + char *ret; + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + ret = cgroup_path_ns_locked(cgrp, buf, buflen, ns); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + return ret; +} +EXPORT_SYMBOL_GPL(cgroup_path_ns); + /** * task_cgroup_path - cgroup path of a task in the first cgroup hierarchy * @task: target task @@ -2197,7 +2313,7 @@ char *task_cgroup_path(struct task_struct *task, char *buf, size_t buflen) if (root) { cgrp = task_cgroup_from_root(task, root); - path = cgroup_path(cgrp, buf, buflen); + path = cgroup_path_ns_locked(cgrp, buf, buflen, &init_cgroup_ns); } else { /* if no hierarchy exists, everyone is in "/" */ if (strlcpy(buf, "/", buflen) < buflen) @@ -2216,6 +2332,9 @@ struct cgroup_taskset { struct list_head src_csets; struct list_head dst_csets; + /* the subsys currently being processed */ + int ssid; + /* * Fields for cgroup_taskset_*() iteration. * @@ -2278,25 +2397,29 @@ static void cgroup_taskset_add(struct task_struct *task, /** * cgroup_taskset_first - reset taskset and return the first task * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * @tset iteration is initialized and the first task is returned. */ -struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { tset->cur_cset = list_first_entry(tset->csets, struct css_set, mg_node); tset->cur_task = NULL; - return cgroup_taskset_next(tset); + return cgroup_taskset_next(tset, dst_cssp); } /** * cgroup_taskset_next - iterate to the next task in taskset * @tset: taskset of interest + * @dst_cssp: output variable for the destination css * * Return the next task in @tset. Iteration must have been initialized * with cgroup_taskset_first(). */ -struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) +struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset, + struct cgroup_subsys_state **dst_cssp) { struct css_set *cset = tset->cur_cset; struct task_struct *task = tset->cur_task; @@ -2311,6 +2434,18 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) if (&task->cg_list != &cset->mg_tasks) { tset->cur_cset = cset; tset->cur_task = task; + + /* + * This function may be called both before and + * after cgroup_taskset_migrate(). The two cases + * can be distinguished by looking at whether @cset + * has its ->mg_dst_cset set. + */ + if (cset->mg_dst_cset) + *dst_cssp = cset->mg_dst_cset->subsys[tset->ssid]; + else + *dst_cssp = cset->subsys[tset->ssid]; + return task; } @@ -2322,37 +2457,38 @@ struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) } /** - * cgroup_taskset_migrate - migrate a taskset to a cgroup + * cgroup_taskset_migrate - migrate a taskset * @tset: taget taskset - * @dst_cgrp: destination cgroup + * @root: cgroup root the migration is taking place on * - * Migrate tasks in @tset to @dst_cgrp. This function fails iff one of the - * ->can_attach callbacks fails and guarantees that either all or none of - * the tasks in @tset are migrated. @tset is consumed regardless of - * success. + * Migrate tasks in @tset as setup by migration preparation functions. + * This function fails iff one of the ->can_attach callbacks fails and + * guarantees that either all or none of the tasks in @tset are migrated. + * @tset is consumed regardless of success. */ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, - struct cgroup *dst_cgrp) + struct cgroup_root *root) { - struct cgroup_subsys_state *css, *failed_css = NULL; + struct cgroup_subsys *ss; struct task_struct *task, *tmp_task; struct css_set *cset, *tmp_cset; - int i, ret; + int ssid, failed_ssid, ret; /* methods shouldn't be called if no task is actually migrating */ if (list_empty(&tset->src_csets)) return 0; /* check that we can legitimately attach to the cgroup */ - for_each_e_css(css, i, dst_cgrp) { - if (css->ss->can_attach) { - ret = css->ss->can_attach(css, tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->can_attach) { + tset->ssid = ssid; + ret = ss->can_attach(tset); if (ret) { - failed_css = css; + failed_ssid = ssid; goto out_cancel_attach; } } - } + } while_each_subsys_mask(); /* * Now that we're guaranteed success, proceed to move all tasks to @@ -2379,20 +2515,25 @@ static int cgroup_taskset_migrate(struct cgroup_taskset *tset, */ tset->csets = &tset->dst_csets; - for_each_e_css(css, i, dst_cgrp) - if (css->ss->attach) - css->ss->attach(css, tset); + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ss->attach) { + tset->ssid = ssid; + ss->attach(tset); + } + } while_each_subsys_mask(); ret = 0; goto out_release_tset; out_cancel_attach: - for_each_e_css(css, i, dst_cgrp) { - if (css == failed_css) + do_each_subsys_mask(ss, ssid, root->subsys_mask) { + if (ssid == failed_ssid) break; - if (css->ss->cancel_attach) - css->ss->cancel_attach(css, tset); - } + if (ss->cancel_attach) { + tset->ssid = ssid; + ss->cancel_attach(tset); + } + } while_each_subsys_mask(); out_release_tset: spin_lock_bh(&css_set_lock); list_splice_init(&tset->dst_csets, &tset->src_csets); @@ -2405,6 +2546,20 @@ out_release_tset: } /** + * cgroup_may_migrate_to - verify whether a cgroup can be migration destination + * @dst_cgrp: destination cgroup to test + * + * On the default hierarchy, except for the root, subtree_control must be + * zero for migration destination cgroups with tasks so that child cgroups + * don't compete against tasks. + */ +static bool cgroup_may_migrate_to(struct cgroup *dst_cgrp) +{ + return !cgroup_on_dfl(dst_cgrp) || !cgroup_parent(dst_cgrp) || + !dst_cgrp->subtree_control; +} + +/** * cgroup_migrate_finish - cleanup after attach * @preloaded_csets: list of preloaded css_sets * @@ -2420,6 +2575,7 @@ static void cgroup_migrate_finish(struct list_head *preloaded_csets) spin_lock_bh(&css_set_lock); list_for_each_entry_safe(cset, tmp_cset, preloaded_csets, mg_preload_node) { cset->mg_src_cgrp = NULL; + cset->mg_dst_cgrp = NULL; cset->mg_dst_cset = NULL; list_del_init(&cset->mg_preload_node); put_css_set_locked(cset); @@ -2452,58 +2608,56 @@ static void cgroup_migrate_add_src(struct css_set *src_cset, lockdep_assert_held(&cgroup_mutex); lockdep_assert_held(&css_set_lock); + /* + * If ->dead, @src_set is associated with one or more dead cgroups + * and doesn't contain any migratable tasks. Ignore it early so + * that the rest of migration path doesn't get confused by it. + */ + if (src_cset->dead) + return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); if (!list_empty(&src_cset->mg_preload_node)) return; WARN_ON(src_cset->mg_src_cgrp); + WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); WARN_ON(!list_empty(&src_cset->mg_node)); src_cset->mg_src_cgrp = src_cgrp; + src_cset->mg_dst_cgrp = dst_cgrp; get_css_set(src_cset); list_add(&src_cset->mg_preload_node, preloaded_csets); } /** * cgroup_migrate_prepare_dst - prepare destination css_sets for migration - * @dst_cgrp: the destination cgroup (may be %NULL) * @preloaded_csets: list of preloaded source css_sets * - * Tasks are about to be moved to @dst_cgrp and all the source css_sets - * have been preloaded to @preloaded_csets. This function looks up and - * pins all destination css_sets, links each to its source, and append them - * to @preloaded_csets. If @dst_cgrp is %NULL, the destination of each - * source css_set is assumed to be its cgroup on the default hierarchy. + * Tasks are about to be moved and all the source css_sets have been + * preloaded to @preloaded_csets. This function looks up and pins all + * destination css_sets, links each to its source, and append them to + * @preloaded_csets. * * This function must be called after cgroup_migrate_add_src() has been * called on each migration source css_set. After migration is performed * using cgroup_migrate(), cgroup_migrate_finish() must be called on * @preloaded_csets. */ -static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, - struct list_head *preloaded_csets) +static int cgroup_migrate_prepare_dst(struct list_head *preloaded_csets) { LIST_HEAD(csets); struct css_set *src_cset, *tmp_cset; lockdep_assert_held(&cgroup_mutex); - /* - * Except for the root, child_subsys_mask must be zero for a cgroup - * with tasks so that child cgroups don't compete against tasks. - */ - if (dst_cgrp && cgroup_on_dfl(dst_cgrp) && cgroup_parent(dst_cgrp) && - dst_cgrp->child_subsys_mask) - return -EBUSY; - /* look up the dst cset for each src cset and link it to src */ list_for_each_entry_safe(src_cset, tmp_cset, preloaded_csets, mg_preload_node) { struct css_set *dst_cset; - dst_cset = find_css_set(src_cset, - dst_cgrp ?: src_cset->dfl_cgrp); + dst_cset = find_css_set(src_cset, src_cset->mg_dst_cgrp); if (!dst_cset) goto err; @@ -2516,6 +2670,7 @@ static int cgroup_migrate_prepare_dst(struct cgroup *dst_cgrp, */ if (src_cset == dst_cset) { src_cset->mg_src_cgrp = NULL; + src_cset->mg_dst_cgrp = NULL; list_del_init(&src_cset->mg_preload_node); put_css_set(src_cset); put_css_set(dst_cset); @@ -2541,11 +2696,11 @@ err: * cgroup_migrate - migrate a process or task to a cgroup * @leader: the leader of the process or the task to migrate * @threadgroup: whether @leader points to the whole process or a single task - * @cgrp: the destination cgroup + * @root: cgroup root migration is taking place on * - * Migrate a process or task denoted by @leader to @cgrp. If migrating a - * process, the caller must be holding cgroup_threadgroup_rwsem. The - * caller is also responsible for invoking cgroup_migrate_add_src() and + * Migrate a process or task denoted by @leader. If migrating a process, + * the caller must be holding cgroup_threadgroup_rwsem. The caller is also + * responsible for invoking cgroup_migrate_add_src() and * cgroup_migrate_prepare_dst() on the targets before invoking this * function and following up with cgroup_migrate_finish(). * @@ -2556,7 +2711,7 @@ err: * actually starting migrating. */ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, - struct cgroup *cgrp) + struct cgroup_root *root) { struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); struct task_struct *task; @@ -2577,7 +2732,7 @@ static int cgroup_migrate(struct task_struct *leader, bool threadgroup, rcu_read_unlock(); spin_unlock_bh(&css_set_lock); - return cgroup_taskset_migrate(&tset, cgrp); + return cgroup_taskset_migrate(&tset, root); } /** @@ -2595,6 +2750,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(dst_cgrp)) + return -EBUSY; + /* look up all src csets */ spin_lock_bh(&css_set_lock); rcu_read_lock(); @@ -2609,9 +2767,9 @@ static int cgroup_attach_task(struct cgroup *dst_cgrp, spin_unlock_bh(&css_set_lock); /* prepare dst csets and commit */ - ret = cgroup_migrate_prepare_dst(dst_cgrp, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (!ret) - ret = cgroup_migrate(leader, threadgroup, dst_cgrp); + ret = cgroup_migrate(leader, threadgroup, dst_cgrp->root); cgroup_migrate_finish(&preloaded_csets); return ret; @@ -2674,7 +2832,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, if (kstrtoint(strstrip(buf), 0, &pid) || pid < 0) return -EINVAL; - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; @@ -2718,6 +2876,7 @@ out_unlock_rcu: out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -2771,7 +2930,7 @@ static ssize_t cgroup_release_agent_write(struct kernfs_open_file *of, BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, false); if (!cgrp) return -ENODEV; spin_lock(&release_agent_path_lock); @@ -2799,38 +2958,28 @@ static int cgroup_sane_behavior_show(struct seq_file *seq, void *v) return 0; } -static void cgroup_print_ss_mask(struct seq_file *seq, unsigned long ss_mask) +static void cgroup_print_ss_mask(struct seq_file *seq, u16 ss_mask) { struct cgroup_subsys *ss; bool printed = false; int ssid; - for_each_subsys_which(ss, ssid, &ss_mask) { + do_each_subsys_mask(ss, ssid, ss_mask) { if (printed) seq_putc(seq, ' '); seq_printf(seq, "%s", ss->name); printed = true; - } + } while_each_subsys_mask(); if (printed) seq_putc(seq, '\n'); } -/* show controllers which are currently attached to the default hierarchy */ -static int cgroup_root_controllers_show(struct seq_file *seq, void *v) -{ - struct cgroup *cgrp = seq_css(seq)->cgroup; - - cgroup_print_ss_mask(seq, cgrp->root->subsys_mask & - ~cgrp_dfl_root_inhibit_ss_mask); - return 0; -} - /* show controllers which are enabled from the parent */ static int cgroup_controllers_show(struct seq_file *seq, void *v) { struct cgroup *cgrp = seq_css(seq)->cgroup; - cgroup_print_ss_mask(seq, cgroup_parent(cgrp)->subtree_control); + cgroup_print_ss_mask(seq, cgroup_control(cgrp)); return 0; } @@ -2847,16 +2996,17 @@ static int cgroup_subtree_control_show(struct seq_file *seq, void *v) * cgroup_update_dfl_csses - update css assoc of a subtree in default hierarchy * @cgrp: root of the subtree to update csses for * - * @cgrp's child_subsys_mask has changed and its subtree's (self excluded) - * css associations need to be updated accordingly. This function looks up - * all css_sets which are attached to the subtree, creates the matching - * updated css_sets and migrates the tasks to the new ones. + * @cgrp's control masks have changed and its subtree's css associations + * need to be updated accordingly. This function looks up all css_sets + * which are attached to the subtree, creates the matching updated css_sets + * and migrates the tasks to the new ones. */ static int cgroup_update_dfl_csses(struct cgroup *cgrp) { LIST_HEAD(preloaded_csets); struct cgroup_taskset tset = CGROUP_TASKSET_INIT(tset); - struct cgroup_subsys_state *css; + struct cgroup_subsys_state *d_css; + struct cgroup *dsct; struct css_set *src_cset; int ret; @@ -2866,21 +3016,17 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) /* look up all csses currently attached to @cgrp's subtree */ spin_lock_bh(&css_set_lock); - css_for_each_descendant_pre(css, cgroup_css(cgrp, NULL)) { + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { struct cgrp_cset_link *link; - /* self is not affected by child_subsys_mask change */ - if (css->cgroup == cgrp) - continue; - - list_for_each_entry(link, &css->cgroup->cset_links, cset_link) - cgroup_migrate_add_src(link->cset, cgrp, + list_for_each_entry(link, &dsct->cset_links, cset_link) + cgroup_migrate_add_src(link->cset, dsct, &preloaded_csets); } spin_unlock_bh(&css_set_lock); /* NULL dst indicates self on default hierarchy */ - ret = cgroup_migrate_prepare_dst(NULL, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_finish; @@ -2898,20 +3044,272 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp) } spin_unlock_bh(&css_set_lock); - ret = cgroup_taskset_migrate(&tset, cgrp); + ret = cgroup_taskset_migrate(&tset, cgrp->root); out_finish: cgroup_migrate_finish(&preloaded_csets); percpu_up_write(&cgroup_threadgroup_rwsem); return ret; } +/** + * cgroup_lock_and_drain_offline - lock cgroup_mutex and drain offlined csses + * @cgrp: root of the target subtree + * + * Because css offlining is asynchronous, userland may try to re-enable a + * controller while the previous css is still around. This function grabs + * cgroup_mutex and drains the previous css instances of @cgrp's subtree. + */ +static void cgroup_lock_and_drain_offline(struct cgroup *cgrp) + __acquires(&cgroup_mutex) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + +restart: + mutex_lock(&cgroup_mutex); + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + DEFINE_WAIT(wait); + + if (!css || !percpu_ref_is_dying(&css->refcnt)) + continue; + + cgroup_get(dsct); + prepare_to_wait(&dsct->offline_waitq, &wait, + TASK_UNINTERRUPTIBLE); + + mutex_unlock(&cgroup_mutex); + schedule(); + finish_wait(&dsct->offline_waitq, &wait); + + cgroup_put(dsct); + goto restart; + } + } +} + +/** + * cgroup_save_control - save control masks of a subtree + * @cgrp: root of the target subtree + * + * Save ->subtree_control and ->subtree_ss_mask to the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_save_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->old_subtree_control = dsct->subtree_control; + dsct->old_subtree_ss_mask = dsct->subtree_ss_mask; + } +} + +/** + * cgroup_propagate_control - refresh control masks of a subtree + * @cgrp: root of the target subtree + * + * For @cgrp and its subtree, ensure ->subtree_ss_mask matches + * ->subtree_control and propagate controller availability through the + * subtree so that descendants don't have unavailable controllers enabled. + */ +static void cgroup_propagate_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + dsct->subtree_control &= cgroup_control(dsct); + dsct->subtree_ss_mask = + cgroup_calc_subtree_ss_mask(dsct->subtree_control, + cgroup_ss_mask(dsct)); + } +} + +/** + * cgroup_restore_control - restore control masks of a subtree + * @cgrp: root of the target subtree + * + * Restore ->subtree_control and ->subtree_ss_mask from the respective old_ + * prefixed fields for @cgrp's subtree including @cgrp itself. + */ +static void cgroup_restore_control(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + dsct->subtree_control = dsct->old_subtree_control; + dsct->subtree_ss_mask = dsct->old_subtree_ss_mask; + } +} + +static bool css_visible(struct cgroup_subsys_state *css) +{ + struct cgroup_subsys *ss = css->ss; + struct cgroup *cgrp = css->cgroup; + + if (cgroup_control(cgrp) & (1 << ss->id)) + return true; + if (!(cgroup_ss_mask(cgrp) & (1 << ss->id))) + return false; + return cgroup_on_dfl(cgrp) && ss->implicit_on_dfl; +} + +/** + * cgroup_apply_control_enable - enable or show csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and create new csses or make the existing ones + * visible. A css is created invisible if it's being implicitly enabled + * through dependency. An invisible css is made visible when the userland + * explicitly enables it. + * + * Returns 0 on success, -errno on failure. On failure, csses which have + * been processed already aren't cleaned up. The caller is responsible for + * cleaning up with cgroup_apply_control_disble(). + */ +static int cgroup_apply_control_enable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid, ret; + + cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!(cgroup_ss_mask(dsct) & (1 << ss->id))) + continue; + + if (!css) { + css = css_create(dsct, ss); + if (IS_ERR(css)) + return PTR_ERR(css); + } + + if (css_visible(css)) { + ret = css_populate_dir(css); + if (ret) + return ret; + } + } + } + + return 0; +} + +/** + * cgroup_apply_control_disable - kill or hide csses according to control + * @cgrp: root of the target subtree + * + * Walk @cgrp's subtree and kill and hide csses so that they match + * cgroup_ss_mask() and cgroup_visible_mask(). + * + * A css is hidden when the userland requests it to be disabled while other + * subsystems are still depending on it. The css must not actively control + * resources and be in the vanilla state if it's made visible again later. + * Controllers which may be depended upon should provide ->css_reset() for + * this purpose. + */ +static void cgroup_apply_control_disable(struct cgroup *cgrp) +{ + struct cgroup *dsct; + struct cgroup_subsys_state *d_css; + struct cgroup_subsys *ss; + int ssid; + + cgroup_for_each_live_descendant_post(dsct, d_css, cgrp) { + for_each_subsys(ss, ssid) { + struct cgroup_subsys_state *css = cgroup_css(dsct, ss); + + WARN_ON_ONCE(css && percpu_ref_is_dying(&css->refcnt)); + + if (!css) + continue; + + if (css->parent && + !(cgroup_ss_mask(dsct) & (1 << ss->id))) { + kill_css(css); + } else if (!css_visible(css)) { + css_clear_dir(css); + if (ss->css_reset) + ss->css_reset(css); + } + } + } +} + +/** + * cgroup_apply_control - apply control mask updates to the subtree + * @cgrp: root of the target subtree + * + * subsystems can be enabled and disabled in a subtree using the following + * steps. + * + * 1. Call cgroup_save_control() to stash the current state. + * 2. Update ->subtree_control masks in the subtree as desired. + * 3. Call cgroup_apply_control() to apply the changes. + * 4. Optionally perform other related operations. + * 5. Call cgroup_finalize_control() to finish up. + * + * This function implements step 3 and propagates the mask changes + * throughout @cgrp's subtree, updates csses accordingly and perform + * process migrations. + */ +static int cgroup_apply_control(struct cgroup *cgrp) +{ + int ret; + + cgroup_propagate_control(cgrp); + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + return ret; + + /* + * At this point, cgroup_e_css() results reflect the new csses + * making the following cgroup_update_dfl_csses() properly update + * css associations of all tasks in the subtree. + */ + ret = cgroup_update_dfl_csses(cgrp); + if (ret) + return ret; + + return 0; +} + +/** + * cgroup_finalize_control - finalize control mask update + * @cgrp: root of the target subtree + * @ret: the result of the update + * + * Finalize control mask update. See cgroup_apply_control() for more info. + */ +static void cgroup_finalize_control(struct cgroup *cgrp, int ret) +{ + if (ret) { + cgroup_restore_control(cgrp); + cgroup_propagate_control(cgrp); + } + + cgroup_apply_control_disable(cgrp); +} + /* change the enabled child controllers for a cgroup in the default hierarchy */ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - unsigned long enable = 0, disable = 0; - unsigned long css_enable, css_disable, old_sc, new_sc, old_ss, new_ss; + u16 enable = 0, disable = 0; struct cgroup *cgrp, *child; struct cgroup_subsys *ss; char *tok; @@ -2923,11 +3321,9 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, */ buf = strstrip(buf); while ((tok = strsep(&buf, " "))) { - unsigned long tmp_ss_mask = ~cgrp_dfl_root_inhibit_ss_mask; - if (tok[0] == '\0') continue; - for_each_subsys_which(ss, ssid, &tmp_ss_mask) { + do_each_subsys_mask(ss, ssid, ~cgrp_dfl_inhibit_ss_mask) { if (!cgroup_ssid_enabled(ssid) || strcmp(tok + 1, ss->name)) continue; @@ -2942,12 +3338,12 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, return -EINVAL; } break; - } + } while_each_subsys_mask(); if (ssid == CGROUP_SUBSYS_COUNT) return -EINVAL; } - cgrp = cgroup_kn_lock_live(of->kn); + cgrp = cgroup_kn_lock_live(of->kn, true); if (!cgrp) return -ENODEV; @@ -2958,10 +3354,7 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, continue; } - /* unavailable or not enabled on the parent? */ - if (!(cgrp_dfl_root.subsys_mask & (1 << ssid)) || - (cgroup_parent(cgrp) && - !(cgroup_parent(cgrp)->subtree_control & (1 << ssid)))) { + if (!(cgroup_control(cgrp) & (1 << ssid))) { ret = -ENOENT; goto out_unlock; } @@ -2995,150 +3388,21 @@ static ssize_t cgroup_subtree_control_write(struct kernfs_open_file *of, goto out_unlock; } - /* - * Update subsys masks and calculate what needs to be done. More - * subsystems than specified may need to be enabled or disabled - * depending on subsystem dependencies. - */ - old_sc = cgrp->subtree_control; - old_ss = cgrp->child_subsys_mask; - new_sc = (old_sc | enable) & ~disable; - new_ss = cgroup_calc_child_subsys_mask(cgrp, new_sc); - - css_enable = ~old_ss & new_ss; - css_disable = old_ss & ~new_ss; - enable |= css_enable; - disable |= css_disable; - - /* - * Because css offlining is asynchronous, userland might try to - * re-enable the same controller while the previous instance is - * still around. In such cases, wait till it's gone using - * offline_waitq. - */ - for_each_subsys_which(ss, ssid, &css_enable) { - cgroup_for_each_live_child(child, cgrp) { - DEFINE_WAIT(wait); - - if (!cgroup_css(child, ss)) - continue; - - cgroup_get(child); - prepare_to_wait(&child->offline_waitq, &wait, - TASK_UNINTERRUPTIBLE); - cgroup_kn_unlock(of->kn); - schedule(); - finish_wait(&child->offline_waitq, &wait); - cgroup_put(child); - - return restart_syscall(); - } - } - - cgrp->subtree_control = new_sc; - cgrp->child_subsys_mask = new_ss; - - /* - * Create new csses or make the existing ones visible. A css is - * created invisible if it's being implicitly enabled through - * dependency. An invisible css is made visible when the userland - * explicitly enables it. - */ - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - if (css_enable & (1 << ssid)) - ret = create_css(child, ss, - cgrp->subtree_control & (1 << ssid)); - else - ret = css_populate_dir(cgroup_css(child, ss), - NULL); - if (ret) - goto err_undo_css; - } - } - - /* - * At this point, cgroup_e_css() results reflect the new csses - * making the following cgroup_update_dfl_csses() properly update - * css associations of all tasks in the subtree. - */ - ret = cgroup_update_dfl_csses(cgrp); - if (ret) - goto err_undo_css; - - /* - * All tasks are migrated out of disabled csses. Kill or hide - * them. A css is hidden when the userland requests it to be - * disabled while other subsystems are still depending on it. The - * css must not actively control resources and be in the vanilla - * state if it's made visible again later. Controllers which may - * be depended upon should provide ->css_reset() for this purpose. - */ - for_each_subsys(ss, ssid) { - if (!(disable & (1 << ssid))) - continue; + /* save and update control masks and prepare csses */ + cgroup_save_control(cgrp); - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); + cgrp->subtree_control |= enable; + cgrp->subtree_control &= ~disable; - if (css_disable & (1 << ssid)) { - kill_css(css); - } else { - css_clear_dir(css, NULL); - if (ss->css_reset) - ss->css_reset(css); - } - } - } + ret = cgroup_apply_control(cgrp); - /* - * The effective csses of all the descendants (excluding @cgrp) may - * have changed. Subsystems can optionally subscribe to this event - * by implementing ->css_e_css_changed() which is invoked if any of - * the effective csses seen from the css's cgroup may have changed. - */ - for_each_subsys(ss, ssid) { - struct cgroup_subsys_state *this_css = cgroup_css(cgrp, ss); - struct cgroup_subsys_state *css; - - if (!ss->css_e_css_changed || !this_css) - continue; - - css_for_each_descendant_pre(css, this_css) - if (css != this_css) - ss->css_e_css_changed(css); - } + cgroup_finalize_control(cgrp, ret); kernfs_activate(cgrp->kn); ret = 0; out_unlock: cgroup_kn_unlock(of->kn); return ret ?: nbytes; - -err_undo_css: - cgrp->subtree_control = old_sc; - cgrp->child_subsys_mask = old_ss; - - for_each_subsys(ss, ssid) { - if (!(enable & (1 << ssid))) - continue; - - cgroup_for_each_live_child(child, cgrp) { - struct cgroup_subsys_state *css = cgroup_css(child, ss); - - if (!css) - continue; - - if (css_enable & (1 << ssid)) - kill_css(css); - else - css_clear_dir(css, NULL); - } - } - goto out_unlock; } static int cgroup_events_show(struct seq_file *seq, void *v) @@ -3313,9 +3577,9 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, if (cft->file_offset) { struct cgroup_file *cfile = (void *)css + cft->file_offset; - kernfs_get(kn); + spin_lock_irq(&cgroup_file_kn_lock); cfile->kn = kn; - list_add(&cfile->node, &css->files); + spin_unlock_irq(&cgroup_file_kn_lock); } return 0; @@ -3336,7 +3600,7 @@ static int cgroup_addrm_files(struct cgroup_subsys_state *css, bool is_add) { struct cftype *cft, *cft_end = NULL; - int ret; + int ret = 0; lockdep_assert_held(&cgroup_mutex); @@ -3365,7 +3629,7 @@ restart: cgroup_rm_file(cgrp, cft); } } - return 0; + return ret; } static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) @@ -3382,7 +3646,7 @@ static int cgroup_apply_cftypes(struct cftype *cfts, bool is_add) css_for_each_descendant_pre(css, cgroup_css(root, ss)) { struct cgroup *cgrp = css->cgroup; - if (cgroup_is_dead(cgrp)) + if (!(css->flags & CSS_VISIBLE)) continue; ret = cgroup_addrm_files(css, cgrp, cfts, is_add); @@ -3553,6 +3817,22 @@ int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) } /** + * cgroup_file_notify - generate a file modified event for a cgroup_file + * @cfile: target cgroup_file + * + * @cfile must have been obtained by setting cftype->file_offset. + */ +void cgroup_file_notify(struct cgroup_file *cfile) +{ + unsigned long flags; + + spin_lock_irqsave(&cgroup_file_kn_lock, flags); + if (cfile->kn) + kernfs_notify(cfile->kn); + spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); +} + +/** * cgroup_task_count - count the number of tasks in a cgroup. * @cgrp: the cgroup in question * @@ -3987,6 +4267,9 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) struct task_struct *task; int ret; + if (!cgroup_may_migrate_to(to)) + return -EBUSY; + mutex_lock(&cgroup_mutex); /* all tasks in @from are being moved, all csets are source */ @@ -3995,12 +4278,12 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) cgroup_migrate_add_src(link->cset, to, &preloaded_csets); spin_unlock_bh(&css_set_lock); - ret = cgroup_migrate_prepare_dst(to, &preloaded_csets); + ret = cgroup_migrate_prepare_dst(&preloaded_csets); if (ret) goto out_err; /* - * Migrate tasks one-by-one until @form is empty. This fails iff + * Migrate tasks one-by-one until @from is empty. This fails iff * ->can_attach() fails. */ do { @@ -4011,7 +4294,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from) css_task_iter_end(&it); if (task) { - ret = cgroup_migrate(task, false, to); + ret = cgroup_migrate(task, false, to->root); put_task_struct(task); } } while (task && !ret); @@ -4518,12 +4801,6 @@ static struct cftype cgroup_dfl_base_files[] = { }, { .name = "cgroup.controllers", - .flags = CFTYPE_ONLY_ON_ROOT, - .seq_show = cgroup_root_controllers_show, - }, - { - .name = "cgroup.controllers", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = cgroup_controllers_show, }, { @@ -4613,23 +4890,20 @@ static void css_free_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); struct cgroup_subsys *ss = css->ss; struct cgroup *cgrp = css->cgroup; - struct cgroup_file *cfile; percpu_ref_exit(&css->refcnt); - list_for_each_entry(cfile, &css->files, node) - kernfs_put(cfile->kn); - if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4695,7 +4969,9 @@ static void css_release_work_fn(struct work_struct *work) * Those are supported by RCU protecting clearing of * cgrp->kn->priv backpointer. */ - RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, NULL); + if (cgrp->kn) + RCU_INIT_POINTER(*(void __rcu __force **)&cgrp->kn->priv, + NULL); } mutex_unlock(&cgroup_mutex); @@ -4724,8 +5000,8 @@ static void init_and_link_css(struct cgroup_subsys_state *css, css->ss = ss; INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); - INIT_LIST_HEAD(&css->files); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4748,6 +5024,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -4762,6 +5042,9 @@ static void offline_css(struct cgroup_subsys_state *css) if (!(css->flags & CSS_ONLINE)) return; + if (ss->css_reset) + ss->css_reset(css); + if (ss->css_offline) ss->css_offline(css); @@ -4772,17 +5055,16 @@ static void offline_css(struct cgroup_subsys_state *css) } /** - * create_css - create a cgroup_subsys_state + * css_create - create a cgroup_subsys_state * @cgrp: the cgroup new css will be associated with * @ss: the subsys of new css - * @visible: whether to create control knobs for the new css or not * * Create a new css associated with @cgrp - @ss pair. On success, the new - * css is online and installed in @cgrp with all interface files created if - * @visible. Returns 0 on success, -errno on failure. + * css is online and installed in @cgrp. This function doesn't create the + * interface files. Returns 0 on success, -errno on failure. */ -static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, - bool visible) +static struct cgroup_subsys_state *css_create(struct cgroup *cgrp, + struct cgroup_subsys *ss) { struct cgroup *parent = cgroup_parent(cgrp); struct cgroup_subsys_state *parent_css = cgroup_css(parent, ss); @@ -4793,7 +5075,7 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, css = ss->css_alloc(parent_css); if (IS_ERR(css)) - return PTR_ERR(css); + return css; init_and_link_css(css, ss, cgrp); @@ -4806,12 +5088,6 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, goto err_free_percpu_ref; css->id = err; - if (visible) { - err = css_populate_dir(css, NULL); - if (err) - goto err_free_id; - } - /* @css is ready to be brought online now, make it visible */ list_add_tail_rcu(&css->sibling, &parent_css->children); cgroup_idr_replace(&ss->css_idr, css, css->id); @@ -4829,45 +5105,30 @@ static int create_css(struct cgroup *cgrp, struct cgroup_subsys *ss, ss->warned_broken_hierarchy = true; } - return 0; + return css; err_list_del: list_del_rcu(&css->sibling); - css_clear_dir(css, NULL); -err_free_id: cgroup_idr_remove(&ss->css_idr, css->id); err_free_percpu_ref: percpu_ref_exit(&css->refcnt); err_free_css: call_rcu(&css->rcu_head, css_free_rcu_fn); - return err; + return ERR_PTR(err); } -static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static struct cgroup *cgroup_create(struct cgroup *parent) { - struct cgroup *parent, *cgrp; - struct cgroup_root *root; - struct cgroup_subsys *ss; - struct kernfs_node *kn; - int ssid, ret; - - /* Do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable. - */ - if (strchr(name, '\n')) - return -EINVAL; - - parent = cgroup_kn_lock_live(parent_kn); - if (!parent) - return -ENODEV; - root = parent->root; + struct cgroup_root *root = parent->root; + struct cgroup *cgrp, *tcgrp; + int level = parent->level + 1; + int ret; /* allocate the cgroup and its ID, 0 is reserved for the root */ - cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); - if (!cgrp) { - ret = -ENOMEM; - goto out_unlock; - } + cgrp = kzalloc(sizeof(*cgrp) + + sizeof(cgrp->ancestor_ids[0]) * (level + 1), GFP_KERNEL); + if (!cgrp) + return ERR_PTR(-ENOMEM); ret = percpu_ref_init(&cgrp->self.refcnt, css_release, 0, GFP_KERNEL); if (ret) @@ -4887,6 +5148,10 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, cgrp->self.parent = &parent->self; cgrp->root = root; + cgrp->level = level; + + for (tcgrp = cgrp; tcgrp; tcgrp = cgroup_parent(tcgrp)) + cgrp->ancestor_ids[tcgrp->level] = tcgrp->id; if (notify_on_release(parent)) set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); @@ -4894,20 +5159,6 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); - /* create the directory */ - kn = kernfs_create_dir(parent->kn, name, mode, cgrp); - if (IS_ERR(kn)) { - ret = PTR_ERR(kn); - goto out_free_id; - } - cgrp->kn = kn; - - /* - * This extra ref will be put in cgroup_free_fn() and guarantees - * that @cgrp->kn is always accessible. - */ - kernfs_get(kn); - cgrp->self.serial_nr = css_serial_nr_next++; /* allocation complete, commit to creation */ @@ -4921,51 +5172,90 @@ static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, */ cgroup_idr_replace(&root->cgroup_idr, cgrp, cgrp->id); - ret = cgroup_kn_set_ugid(kn); - if (ret) - goto out_destroy; + /* + * On the default hierarchy, a child doesn't automatically inherit + * subtree_control from the parent. Each is configured manually. + */ + if (!cgroup_on_dfl(cgrp)) + cgrp->subtree_control = cgroup_control(cgrp); - ret = css_populate_dir(&cgrp->self, NULL); + cgroup_propagate_control(cgrp); + + /* @cgrp doesn't have dir yet so the following will only create csses */ + ret = cgroup_apply_control_enable(cgrp); if (ret) goto out_destroy; - /* let's create and online css's */ - for_each_subsys(ss, ssid) { - if (parent->child_subsys_mask & (1 << ssid)) { - ret = create_css(cgrp, ss, - parent->subtree_control & (1 << ssid)); - if (ret) - goto out_destroy; - } + return cgrp; + +out_cancel_ref: + percpu_ref_exit(&cgrp->self.refcnt); +out_free_cgrp: + kfree(cgrp); + return ERR_PTR(ret); +out_destroy: + cgroup_destroy_locked(cgrp); + return ERR_PTR(ret); +} + +static int cgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + struct cgroup *parent, *cgrp; + struct kernfs_node *kn; + int ret; + + /* do not accept '\n' to prevent making /proc/<pid>/cgroup unparsable */ + if (strchr(name, '\n')) + return -EINVAL; + + parent = cgroup_kn_lock_live(parent_kn, false); + if (!parent) + return -ENODEV; + + cgrp = cgroup_create(parent); + if (IS_ERR(cgrp)) { + ret = PTR_ERR(cgrp); + goto out_unlock; } + /* create the directory */ + kn = kernfs_create_dir(parent->kn, name, mode, cgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + goto out_destroy; + } + cgrp->kn = kn; + /* - * On the default hierarchy, a child doesn't automatically inherit - * subtree_control from the parent. Each is configured manually. + * This extra ref will be put in cgroup_free_fn() and guarantees + * that @cgrp->kn is always accessible. */ - if (!cgroup_on_dfl(cgrp)) { - cgrp->subtree_control = parent->subtree_control; - cgroup_refresh_child_subsys_mask(cgrp); - } + kernfs_get(kn); + ret = cgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + ret = css_populate_dir(&cgrp->self); + if (ret) + goto out_destroy; + + ret = cgroup_apply_control_enable(cgrp); + if (ret) + goto out_destroy; + + /* let's create and online css's */ kernfs_activate(kn); ret = 0; goto out_unlock; -out_free_id: - cgroup_idr_remove(&root->cgroup_idr, cgrp->id); -out_cancel_ref: - percpu_ref_exit(&cgrp->self.refcnt); -out_free_cgrp: - kfree(cgrp); +out_destroy: + cgroup_destroy_locked(cgrp); out_unlock: cgroup_kn_unlock(parent_kn); return ret; - -out_destroy: - cgroup_destroy_locked(cgrp); - goto out_unlock; } /* @@ -4979,10 +5269,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -4991,8 +5286,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** @@ -5012,7 +5309,7 @@ static void kill_css(struct cgroup_subsys_state *css) * This must happen before css is disassociated with its cgroup. * See seq_css() for details. */ - css_clear_dir(css, NULL); + css_clear_dir(css); /* * Killing would put the base ref, but we need to keep it alive @@ -5061,6 +5358,7 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) __releases(&cgroup_mutex) __acquires(&cgroup_mutex) { struct cgroup_subsys_state *css; + struct cgrp_cset_link *link; int ssid; lockdep_assert_held(&cgroup_mutex); @@ -5081,11 +5379,18 @@ static int cgroup_destroy_locked(struct cgroup *cgrp) return -EBUSY; /* - * Mark @cgrp dead. This prevents further task migration and child - * creation by disabling cgroup_lock_live_group(). + * Mark @cgrp and the associated csets dead. The former prevents + * further task migration and child creation by disabling + * cgroup_lock_live_group(). The latter makes the csets ignored by + * the migration path. */ cgrp->self.flags &= ~CSS_ONLINE; + spin_lock_bh(&css_set_lock); + list_for_each_entry(link, &cgrp->cset_links, cset_link) + link->cset->dead = true; + spin_unlock_bh(&css_set_lock); + /* initiate massacre of all css's */ for_each_css(css, ssid, cgrp) kill_css(css); @@ -5109,7 +5414,7 @@ static int cgroup_rmdir(struct kernfs_node *kn) struct cgroup *cgrp; int ret = 0; - cgrp = cgroup_kn_lock_live(kn); + cgrp = cgroup_kn_lock_live(kn, false); if (!cgrp) return 0; @@ -5131,7 +5436,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) { struct cgroup_subsys_state *css; - printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); + pr_debug("Initializing cgroup subsys %s\n", ss->name); mutex_lock(&cgroup_mutex); @@ -5199,7 +5504,7 @@ int __init cgroup_init_early(void) for_each_subsys(ss, i) { WARN(!ss->css_alloc || !ss->css_free || ss->name || ss->id, - "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p name:id=%d:%s\n", + "invalid cgroup_subsys %d:%s css_alloc=%p css_free=%p id:name=%d:%s\n", i, cgroup_subsys_name[i], ss->css_alloc, ss->css_free, ss->id, ss->name); WARN(strlen(cgroup_subsys_name[i]) > MAX_CGROUP_TYPE_NAMELEN, @@ -5216,7 +5521,7 @@ int __init cgroup_init_early(void) return 0; } -static unsigned long cgroup_disable_mask __initdata; +static u16 cgroup_disable_mask __initdata; /** * cgroup_init - cgroup initialization @@ -5227,18 +5532,23 @@ static unsigned long cgroup_disable_mask __initdata; int __init cgroup_init(void) { struct cgroup_subsys *ss; - unsigned long key; int ssid; + BUILD_BUG_ON(CGROUP_SUBSYS_COUNT > 16); BUG_ON(percpu_init_rwsem(&cgroup_threadgroup_rwsem)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_dfl_base_files)); BUG_ON(cgroup_init_cftypes(NULL, cgroup_legacy_base_files)); + get_user_ns(init_cgroup_ns.user_ns); + mutex_lock(&cgroup_mutex); - /* Add init_css_set to the hash table */ - key = css_set_hash(init_css_set.subsys); - hash_add(css_set_table, &init_css_set.hlist, key); + /* + * Add init_css_set to the hash table so that dfl_root can link to + * it during init. + */ + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); BUG_ON(cgroup_setup_root(&cgrp_dfl_root, 0)); @@ -5271,10 +5581,16 @@ int __init cgroup_init(void) continue; } + if (cgroup_ssid_no_v1(ssid)) + printk(KERN_INFO "Disabling %s control group subsystem in v1 mounts\n", + ss->name); + cgrp_dfl_root.subsys_mask |= 1 << ss->id; - if (!ss->dfl_cftypes) - cgrp_dfl_root_inhibit_ss_mask |= 1 << ss->id; + if (ss->implicit_on_dfl) + cgrp_dfl_implicit_ss_mask |= 1 << ss->id; + else if (!ss->dfl_cftypes) + cgrp_dfl_inhibit_ss_mask |= 1 << ss->id; if (ss->dfl_cftypes == ss->legacy_cftypes) { WARN_ON(cgroup_add_cftypes(ss, ss->dfl_cftypes)); @@ -5287,8 +5603,14 @@ int __init cgroup_init(void) ss->bind(init_css_set.subsys[ssid]); } + /* init_css_set.subsys[] has been updated, re-hash */ + hash_del(&init_css_set.hlist); + hash_add(css_set_table, &init_css_set.hlist, + css_set_hash(init_css_set.subsys)); + WARN_ON(sysfs_create_mount_point(fs_kobj, "cgroup")); WARN_ON(register_filesystem(&cgroup_fs_type)); + WARN_ON(register_filesystem(&cgroup2_fs_type)); WARN_ON(!proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations)); return 0; @@ -5344,7 +5666,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, struct cgroup *cgrp; int ssid, count = 0; - if (root == &cgrp_dfl_root && !cgrp_dfl_root_visible) + if (root == &cgrp_dfl_root && !cgrp_dfl_visible) continue; seq_printf(m, "%d:", root->hierarchy_id); @@ -5370,7 +5692,8 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns, * " (deleted)" is appended to the cgroup path. */ if (cgroup_on_dfl(cgrp) || !(tsk->flags & PF_EXITING)) { - path = cgroup_path(cgrp, buf, PATH_MAX); + path = cgroup_path_ns_locked(cgrp, buf, PATH_MAX, + current->nsproxy->cgroup_ns); if (!path) { retval = -ENAMETOOLONG; goto out_unlock; @@ -5432,19 +5755,6 @@ static const struct file_operations proc_cgroupstats_operations = { .release = single_release, }; -static void **subsys_canfork_priv_p(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - if (CGROUP_CANFORK_START <= i && i < CGROUP_CANFORK_END) - return &ss_priv[i - CGROUP_CANFORK_START]; - return NULL; -} - -static void *subsys_canfork_priv(void *ss_priv[CGROUP_CANFORK_COUNT], int i) -{ - void **private = subsys_canfork_priv_p(ss_priv, i); - return private ? *private : NULL; -} - /** * cgroup_fork - initialize cgroup related fields during copy_process() * @child: pointer to task_struct of forking parent process. @@ -5467,17 +5777,16 @@ void cgroup_fork(struct task_struct *child) * returns an error, the fork aborts with that error code. This allows for * a cgroup subsystem to conditionally allow or deny new forks. */ -int cgroup_can_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +int cgroup_can_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i, j, ret; - for_each_subsys_which(ss, i, &have_canfork_callback) { - ret = ss->can_fork(child, subsys_canfork_priv_p(ss_priv, i)); + do_each_subsys_mask(ss, i, have_canfork_callback) { + ret = ss->can_fork(child); if (ret) goto out_revert; - } + } while_each_subsys_mask(); return 0; @@ -5486,7 +5795,7 @@ out_revert: if (j >= i) break; if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, j)); + ss->cancel_fork(child); } return ret; @@ -5499,15 +5808,14 @@ out_revert: * This calls the cancel_fork() callbacks if a fork failed *after* * cgroup_can_fork() succeded. */ -void cgroup_cancel_fork(struct task_struct *child, - void *ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_cancel_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; for_each_subsys(ss, i) if (ss->cancel_fork) - ss->cancel_fork(child, subsys_canfork_priv(ss_priv, i)); + ss->cancel_fork(child); } /** @@ -5520,8 +5828,7 @@ void cgroup_cancel_fork(struct task_struct *child, * cgroup_task_iter_start() - to guarantee that the new task ends up on its * list. */ -void cgroup_post_fork(struct task_struct *child, - void *old_ss_priv[CGROUP_CANFORK_COUNT]) +void cgroup_post_fork(struct task_struct *child) { struct cgroup_subsys *ss; int i; @@ -5564,8 +5871,9 @@ void cgroup_post_fork(struct task_struct *child, * css_set; otherwise, @child might change state between ->fork() * and addition to css_set. */ - for_each_subsys_which(ss, i, &have_fork_callback) - ss->fork(child, subsys_canfork_priv(old_ss_priv, i)); + do_each_subsys_mask(ss, i, have_fork_callback) { + ss->fork(child); + } while_each_subsys_mask(); } /** @@ -5608,8 +5916,9 @@ void cgroup_exit(struct task_struct *tsk) } /* see cgroup_post_fork() for details */ - for_each_subsys_which(ss, i, &have_exit_callback) + do_each_subsys_mask(ss, i, have_exit_callback) { ss->exit(tsk); + } while_each_subsys_mask(); } void cgroup_free(struct task_struct *task) @@ -5618,8 +5927,9 @@ void cgroup_free(struct task_struct *task) struct cgroup_subsys *ss; int ssid; - for_each_subsys_which(ss, ssid, &have_free_callback) + do_each_subsys_mask(ss, ssid, have_free_callback) { ss->free(task); + } while_each_subsys_mask(); put_css_set(cset); } @@ -5668,7 +5978,9 @@ static void cgroup_release_agent(struct work_struct *work) if (!pathbuf || !agentbuf) goto out; - path = cgroup_path(cgrp, pathbuf, PATH_MAX); + spin_lock_bh(&css_set_lock); + path = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns); + spin_unlock_bh(&css_set_lock); if (!path) goto out; @@ -5712,6 +6024,33 @@ static int __init cgroup_disable(char *str) } __setup("cgroup_disable=", cgroup_disable); +static int __init cgroup_no_v1(char *str) +{ + struct cgroup_subsys *ss; + char *token; + int i; + + while ((token = strsep(&str, ",")) != NULL) { + if (!*token) + continue; + + if (!strcmp(token, "all")) { + cgroup_no_v1_mask = U16_MAX; + break; + } + + for_each_subsys(ss, i) { + if (strcmp(token, ss->name) && + strcmp(token, ss->legacy_name)) + continue; + + cgroup_no_v1_mask |= 1 << i; + } + } + return 1; +} +__setup("cgroup_no_v1=", cgroup_no_v1); + /** * css_tryget_online_from_dir - get corresponding css from a cgroup dentry * @dentry: directory dentry of interest @@ -5725,12 +6064,13 @@ struct cgroup_subsys_state *css_tryget_online_from_dir(struct dentry *dentry, struct cgroup_subsys *ss) { struct kernfs_node *kn = kernfs_node_from_dentry(dentry); + struct file_system_type *s_type = dentry->d_sb->s_type; struct cgroup_subsys_state *css = NULL; struct cgroup *cgrp; /* is @dentry a cgroup dir? */ - if (dentry->d_sb->s_type != &cgroup_fs_type || !kn || - kernfs_type(kn) != KERNFS_DIR) + if ((s_type != &cgroup_fs_type && s_type != &cgroup2_fs_type) || + !kn || kernfs_type(kn) != KERNFS_DIR) return ERR_PTR(-EBADF); rcu_read_lock(); @@ -5765,6 +6105,220 @@ struct cgroup_subsys_state *css_from_id(int id, struct cgroup_subsys *ss) return id > 0 ? idr_find(&ss->css_idr, id) : NULL; } +/** + * cgroup_get_from_path - lookup and get a cgroup from its default hierarchy path + * @path: path on the default hierarchy + * + * Find the cgroup at @path on the default hierarchy, increment its + * reference count and return it. Returns pointer to the found cgroup on + * success, ERR_PTR(-ENOENT) if @path doens't exist and ERR_PTR(-ENOTDIR) + * if @path points to a non-directory. + */ +struct cgroup *cgroup_get_from_path(const char *path) +{ + struct kernfs_node *kn; + struct cgroup *cgrp; + + mutex_lock(&cgroup_mutex); + + kn = kernfs_walk_and_get(cgrp_dfl_root.cgrp.kn, path); + if (kn) { + if (kernfs_type(kn) == KERNFS_DIR) { + cgrp = kn->priv; + cgroup_get(cgrp); + } else { + cgrp = ERR_PTR(-ENOTDIR); + } + kernfs_put(kn); + } else { + cgrp = ERR_PTR(-ENOENT); + } + + mutex_unlock(&cgroup_mutex); + return cgrp; +} +EXPORT_SYMBOL_GPL(cgroup_get_from_path); + +/* + * sock->sk_cgrp_data handling. For more info, see sock_cgroup_data + * definition in cgroup-defs.h. + */ +#ifdef CONFIG_SOCK_CGROUP_DATA + +#if defined(CONFIG_CGROUP_NET_PRIO) || defined(CONFIG_CGROUP_NET_CLASSID) + +DEFINE_SPINLOCK(cgroup_sk_update_lock); +static bool cgroup_sk_alloc_disabled __read_mostly; + +void cgroup_sk_alloc_disable(void) +{ + if (cgroup_sk_alloc_disabled) + return; + pr_info("cgroup: disabling cgroup2 socket matching due to net_prio or net_cls activation\n"); + cgroup_sk_alloc_disabled = true; +} + +#else + +#define cgroup_sk_alloc_disabled false + +#endif + +void cgroup_sk_alloc(struct sock_cgroup_data *skcd) +{ + if (cgroup_sk_alloc_disabled) + return; + + rcu_read_lock(); + + while (true) { + struct css_set *cset; + + cset = task_css_set(current); + if (likely(cgroup_tryget(cset->dfl_cgrp))) { + skcd->val = (unsigned long)cset->dfl_cgrp; + break; + } + cpu_relax(); + } + + rcu_read_unlock(); +} + +void cgroup_sk_free(struct sock_cgroup_data *skcd) +{ + cgroup_put(sock_cgroup_ptr(skcd)); +} + +#endif /* CONFIG_SOCK_CGROUP_DATA */ + +/* cgroup namespaces */ + +static struct cgroup_namespace *alloc_cgroup_ns(void) +{ + struct cgroup_namespace *new_ns; + int ret; + + new_ns = kzalloc(sizeof(struct cgroup_namespace), GFP_KERNEL); + if (!new_ns) + return ERR_PTR(-ENOMEM); + ret = ns_alloc_inum(&new_ns->ns); + if (ret) { + kfree(new_ns); + return ERR_PTR(ret); + } + atomic_set(&new_ns->count, 1); + new_ns->ns.ops = &cgroupns_operations; + return new_ns; +} + +void free_cgroup_ns(struct cgroup_namespace *ns) +{ + put_css_set(ns->root_cset); + put_user_ns(ns->user_ns); + ns_free_inum(&ns->ns); + kfree(ns); +} +EXPORT_SYMBOL(free_cgroup_ns); + +struct cgroup_namespace *copy_cgroup_ns(unsigned long flags, + struct user_namespace *user_ns, + struct cgroup_namespace *old_ns) +{ + struct cgroup_namespace *new_ns; + struct css_set *cset; + + BUG_ON(!old_ns); + + if (!(flags & CLONE_NEWCGROUP)) { + get_cgroup_ns(old_ns); + return old_ns; + } + + /* Allow only sysadmin to create cgroup namespace. */ + if (!ns_capable(user_ns, CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + + mutex_lock(&cgroup_mutex); + spin_lock_bh(&css_set_lock); + + cset = task_css_set(current); + get_css_set(cset); + + spin_unlock_bh(&css_set_lock); + mutex_unlock(&cgroup_mutex); + + new_ns = alloc_cgroup_ns(); + if (IS_ERR(new_ns)) { + put_css_set(cset); + return new_ns; + } + + new_ns->user_ns = get_user_ns(user_ns); + new_ns->root_cset = cset; + + return new_ns; +} + +static inline struct cgroup_namespace *to_cg_ns(struct ns_common *ns) +{ + return container_of(ns, struct cgroup_namespace, ns); +} + +static int cgroupns_install(struct nsproxy *nsproxy, struct ns_common *ns) +{ + struct cgroup_namespace *cgroup_ns = to_cg_ns(ns); + + if (!ns_capable(current_user_ns(), CAP_SYS_ADMIN) || + !ns_capable(cgroup_ns->user_ns, CAP_SYS_ADMIN)) + return -EPERM; + + /* Don't need to do anything if we are attaching to our own cgroupns. */ + if (cgroup_ns == nsproxy->cgroup_ns) + return 0; + + get_cgroup_ns(cgroup_ns); + put_cgroup_ns(nsproxy->cgroup_ns); + nsproxy->cgroup_ns = cgroup_ns; + + return 0; +} + +static struct ns_common *cgroupns_get(struct task_struct *task) +{ + struct cgroup_namespace *ns = NULL; + struct nsproxy *nsproxy; + + task_lock(task); + nsproxy = task->nsproxy; + if (nsproxy) { + ns = nsproxy->cgroup_ns; + get_cgroup_ns(ns); + } + task_unlock(task); + + return ns ? &ns->ns : NULL; +} + +static void cgroupns_put(struct ns_common *ns) +{ + put_cgroup_ns(to_cg_ns(ns)); +} + +const struct proc_ns_operations cgroupns_operations = { + .name = "cgroup", + .type = CLONE_NEWCGROUP, + .get = cgroupns_get, + .put = cgroupns_put, + .install = cgroupns_install, +}; + +static __init int cgroup_namespaces_init(void) +{ + return 0; +} +subsys_initcall(cgroup_namespaces_init); + #ifdef CONFIG_CGROUP_DEBUG static struct cgroup_subsys_state * debug_css_alloc(struct cgroup_subsys_state *parent_css) diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index f1b30ad5dc6d..1b72d56edce5 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -155,12 +155,10 @@ static void freezer_css_free(struct cgroup_subsys_state *css) * @freezer->lock. freezer_attach() makes the new tasks conform to the * current state and all following state changes can see the new tasks. */ -static void freezer_attach(struct cgroup_subsys_state *new_css, - struct cgroup_taskset *tset) +static void freezer_attach(struct cgroup_taskset *tset) { - struct freezer *freezer = css_freezer(new_css); struct task_struct *task; - bool clear_frozen = false; + struct cgroup_subsys_state *new_css; mutex_lock(&freezer_mutex); @@ -174,22 +172,21 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * current state before executing the following - !frozen tasks may * be visible in a FROZEN cgroup and frozen tasks in a THAWED one. */ - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, new_css, tset) { + struct freezer *freezer = css_freezer(new_css); + if (!(freezer->state & CGROUP_FREEZING)) { __thaw_task(task); } else { freeze_task(task); - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = true; + /* clear FROZEN and propagate upwards */ + while (freezer && (freezer->state & CGROUP_FROZEN)) { + freezer->state &= ~CGROUP_FROZEN; + freezer = parent_freezer(freezer); + } } } - /* propagate FROZEN clearing upwards */ - while (clear_frozen && (freezer = parent_freezer(freezer))) { - freezer->state &= ~CGROUP_FROZEN; - clear_frozen = freezer->state & CGROUP_FREEZING; - } - mutex_unlock(&freezer_mutex); } @@ -203,7 +200,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css, * to do anything as freezer_attach() will put @task into the appropriate * state. */ -static void freezer_fork(struct task_struct *task, void *private) +static void freezer_fork(struct task_struct *task) { struct freezer *freezer; diff --git a/kernel/cgroup_pids.c b/kernel/cgroup_pids.c index cdd8df4e991c..303097b37429 100644 --- a/kernel/cgroup_pids.c +++ b/kernel/cgroup_pids.c @@ -106,7 +106,7 @@ static void pids_uncharge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) pids_cancel(p, num); } @@ -123,7 +123,7 @@ static void pids_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p; - for (p = pids; p; p = parent_pids(p)) + for (p = pids; parent_pids(p); p = parent_pids(p)) atomic64_add(num, &p->counter); } @@ -134,13 +134,13 @@ static void pids_charge(struct pids_cgroup *pids, int num) * * This function follows the set limit. It will fail if the charge would cause * the new value to exceed the hierarchical limit. Returns 0 if the charge - * succeded, otherwise -EAGAIN. + * succeeded, otherwise -EAGAIN. */ static int pids_try_charge(struct pids_cgroup *pids, int num) { struct pids_cgroup *p, *q; - for (p = pids; p; p = parent_pids(p)) { + for (p = pids; parent_pids(p); p = parent_pids(p)) { int64_t new = atomic64_add_return(num, &p->counter); /* @@ -162,13 +162,13 @@ revert: return -EAGAIN; } -static int pids_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int pids_can_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -187,13 +187,13 @@ static int pids_can_attach(struct cgroup_subsys_state *css, return 0; } -static void pids_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void pids_cancel_attach(struct cgroup_taskset *tset) { - struct pids_cgroup *pids = css_pids(css); struct task_struct *task; + struct cgroup_subsys_state *dst_css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, dst_css, tset) { + struct pids_cgroup *pids = css_pids(dst_css); struct cgroup_subsys_state *old_css; struct pids_cgroup *old_pids; @@ -205,65 +205,28 @@ static void pids_cancel_attach(struct cgroup_subsys_state *css, } } -static int pids_can_fork(struct task_struct *task, void **priv_p) +/* + * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies + * on threadgroup_change_begin() held by the copy_process(). + */ +static int pids_can_fork(struct task_struct *task) { struct cgroup_subsys_state *css; struct pids_cgroup *pids; - int err; - /* - * Use the "current" task_css for the pids subsystem as the tentative - * css. It is possible we will charge the wrong hierarchy, in which - * case we will forcefully revert/reapply the charge on the right - * hierarchy after it is committed to the task proper. - */ - css = task_get_css(current, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - err = pids_try_charge(pids, 1); - if (err) - goto err_css_put; - - *priv_p = css; - return 0; - -err_css_put: - css_put(css); - return err; + return pids_try_charge(pids, 1); } -static void pids_cancel_fork(struct task_struct *task, void *priv) -{ - struct cgroup_subsys_state *css = priv; - struct pids_cgroup *pids = css_pids(css); - - pids_uncharge(pids, 1); - css_put(css); -} - -static void pids_fork(struct task_struct *task, void *priv) +static void pids_cancel_fork(struct task_struct *task) { struct cgroup_subsys_state *css; - struct cgroup_subsys_state *old_css = priv; struct pids_cgroup *pids; - struct pids_cgroup *old_pids = css_pids(old_css); - css = task_get_css(task, pids_cgrp_id); + css = task_css_check(current, pids_cgrp_id, true); pids = css_pids(css); - - /* - * If the association has changed, we have to revert and reapply the - * charge/uncharge on the wrong hierarchy to the current one. Since - * the association can only change due to an organisation event, its - * okay for us to ignore the limit in this case. - */ - if (pids != old_pids) { - pids_uncharge(old_pids, 1); - pids_charge(pids, 1); - } - - css_put(css); - css_put(old_css); + pids_uncharge(pids, 1); } static void pids_free(struct task_struct *task) @@ -335,6 +298,7 @@ static struct cftype pids_files[] = { { .name = "current", .read_s64 = pids_current_read, + .flags = CFTYPE_NOT_ON_ROOT, }, { } /* terminate */ }; @@ -346,7 +310,6 @@ struct cgroup_subsys pids_cgrp_subsys = { .cancel_attach = pids_cancel_attach, .can_fork = pids_can_fork, .cancel_fork = pids_cancel_fork, - .fork = pids_fork, .free = pids_free, .legacy_cftypes = pids_files, .dfl_cftypes = pids_files, diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 0a495ab35bc7..9ad37b9e44a7 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -24,7 +24,7 @@ #define CREATE_TRACE_POINTS #include <trace/events/context_tracking.h> -struct static_key context_tracking_enabled = STATIC_KEY_INIT_FALSE; +DEFINE_STATIC_KEY_FALSE(context_tracking_enabled); EXPORT_SYMBOL_GPL(context_tracking_enabled); DEFINE_PER_CPU(struct context_tracking, context_tracking); @@ -58,36 +58,13 @@ static void context_tracking_recursion_exit(void) * instructions to execute won't use any RCU read side critical section * because this function sets RCU in extended quiescent state. */ -void context_tracking_enter(enum ctx_state state) +void __context_tracking_enter(enum ctx_state state) { - unsigned long flags; - - /* - * Repeat the user_enter() check here because some archs may be calling - * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the inline static - * key check. - */ - if (!context_tracking_is_enabled()) - return; - - /* - * Some contexts may involve an exception occuring in an irq, - * leading to that nesting: - * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() - * This would mess up the dyntick_nesting count though. And rcu_irq_*() - * helpers are enough to protect RCU uses inside the exception. So - * just return immediately if we detect we are in an IRQ. - */ - if (in_interrupt()) - return; - /* Kernel threads aren't supposed to go to userspace */ WARN_ON_ONCE(!current->mm); - local_irq_save(flags); if (!context_tracking_recursion_enter()) - goto out_irq_restore; + return; if ( __this_cpu_read(context_tracking.state) != state) { if (__this_cpu_read(context_tracking.active)) { @@ -120,7 +97,27 @@ void context_tracking_enter(enum ctx_state state) __this_cpu_write(context_tracking.state, state); } context_tracking_recursion_exit(); -out_irq_restore: +} +NOKPROBE_SYMBOL(__context_tracking_enter); +EXPORT_SYMBOL_GPL(__context_tracking_enter); + +void context_tracking_enter(enum ctx_state state) +{ + unsigned long flags; + + /* + * Some contexts may involve an exception occuring in an irq, + * leading to that nesting: + * rcu_irq_enter() rcu_user_exit() rcu_user_exit() rcu_irq_exit() + * This would mess up the dyntick_nesting count though. And rcu_irq_*() + * helpers are enough to protect RCU uses inside the exception. So + * just return immediately if we detect we are in an IRQ. + */ + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_enter(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_enter); @@ -128,7 +125,7 @@ EXPORT_SYMBOL_GPL(context_tracking_enter); void context_tracking_user_enter(void) { - context_tracking_enter(CONTEXT_USER); + user_enter(); } NOKPROBE_SYMBOL(context_tracking_user_enter); @@ -144,19 +141,10 @@ NOKPROBE_SYMBOL(context_tracking_user_enter); * This call supports re-entrancy. This way it can be called from any exception * handler without needing to know if we came from userspace or not. */ -void context_tracking_exit(enum ctx_state state) +void __context_tracking_exit(enum ctx_state state) { - unsigned long flags; - - if (!context_tracking_is_enabled()) - return; - - if (in_interrupt()) - return; - - local_irq_save(flags); if (!context_tracking_recursion_enter()) - goto out_irq_restore; + return; if (__this_cpu_read(context_tracking.state) == state) { if (__this_cpu_read(context_tracking.active)) { @@ -173,7 +161,19 @@ void context_tracking_exit(enum ctx_state state) __this_cpu_write(context_tracking.state, CONTEXT_KERNEL); } context_tracking_recursion_exit(); -out_irq_restore: +} +NOKPROBE_SYMBOL(__context_tracking_exit); +EXPORT_SYMBOL_GPL(__context_tracking_exit); + +void context_tracking_exit(enum ctx_state state) +{ + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + __context_tracking_exit(state); local_irq_restore(flags); } NOKPROBE_SYMBOL(context_tracking_exit); @@ -181,7 +181,7 @@ EXPORT_SYMBOL_GPL(context_tracking_exit); void context_tracking_user_exit(void) { - context_tracking_exit(CONTEXT_USER); + user_exit(); } NOKPROBE_SYMBOL(context_tracking_user_exit); @@ -191,7 +191,7 @@ void __init context_tracking_cpu_set(int cpu) if (!per_cpu(context_tracking.active, cpu)) { per_cpu(context_tracking.active, cpu) = true; - static_key_slow_inc(&context_tracking_enabled); + static_branch_inc(&context_tracking_enabled); } if (initialized) diff --git a/kernel/cpu.c b/kernel/cpu.c index 85ff5e26e23b..6ea42e8da861 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -22,13 +22,88 @@ #include <linux/lockdep.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/smpboot.h> + #include <trace/events/power.h> +#define CREATE_TRACE_POINTS +#include <trace/events/cpuhp.h> #include "smpboot.h" +/** + * cpuhp_cpu_state - Per cpu hotplug state storage + * @state: The current cpu state + * @target: The target state + * @thread: Pointer to the hotplug thread + * @should_run: Thread should execute + * @cb_stat: The state for a single callback (install/uninstall) + * @cb: Single callback function (install/uninstall) + * @result: Result of the operation + * @done: Signal completion to the issuer of the task + */ +struct cpuhp_cpu_state { + enum cpuhp_state state; + enum cpuhp_state target; +#ifdef CONFIG_SMP + struct task_struct *thread; + bool should_run; + enum cpuhp_state cb_state; + int (*cb)(unsigned int cpu); + int result; + struct completion done; +#endif +}; + +static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state); + +/** + * cpuhp_step - Hotplug state machine step + * @name: Name of the step + * @startup: Startup function of the step + * @teardown: Teardown function of the step + * @skip_onerr: Do not invoke the functions on error rollback + * Will go away once the notifiers are gone + * @cant_stop: Bringup/teardown can't be stopped at this step + */ +struct cpuhp_step { + const char *name; + int (*startup)(unsigned int cpu); + int (*teardown)(unsigned int cpu); + bool skip_onerr; + bool cant_stop; +}; + +static DEFINE_MUTEX(cpuhp_state_mutex); +static struct cpuhp_step cpuhp_bp_states[]; +static struct cpuhp_step cpuhp_ap_states[]; + +/** + * cpuhp_invoke_callback _ Invoke the callbacks for a given state + * @cpu: The cpu for which the callback should be invoked + * @step: The step in the state machine + * @cb: The callback function to invoke + * + * Called from cpu hotplug and from the state register machinery + */ +static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state step, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int ret = 0; + + if (cb) { + trace_cpuhp_enter(cpu, st->target, step, cb); + ret = cb(cpu); + trace_cpuhp_exit(cpu, st->state, step, ret); + } + return ret; +} + #ifdef CONFIG_SMP /* Serializes the updates to cpu_online_mask, cpu_present_mask */ static DEFINE_MUTEX(cpu_add_remove_lock); +bool cpuhp_tasks_frozen; +EXPORT_SYMBOL_GPL(cpuhp_tasks_frozen); /* * The following two APIs (cpu_maps_update_begin/done) must be used when @@ -207,31 +282,281 @@ int __register_cpu_notifier(struct notifier_block *nb) return raw_notifier_chain_register(&cpu_chain, nb); } -static int __cpu_notify(unsigned long val, void *v, int nr_to_call, +static int __cpu_notify(unsigned long val, unsigned int cpu, int nr_to_call, int *nr_calls) { + unsigned long mod = cpuhp_tasks_frozen ? CPU_TASKS_FROZEN : 0; + void *hcpu = (void *)(long)cpu; + int ret; - ret = __raw_notifier_call_chain(&cpu_chain, val, v, nr_to_call, + ret = __raw_notifier_call_chain(&cpu_chain, val | mod, hcpu, nr_to_call, nr_calls); return notifier_to_errno(ret); } -static int cpu_notify(unsigned long val, void *v) +static int cpu_notify(unsigned long val, unsigned int cpu) { - return __cpu_notify(val, v, -1, NULL); + return __cpu_notify(val, cpu, -1, NULL); } -#ifdef CONFIG_HOTPLUG_CPU +/* Notifier wrappers for transitioning to state machine */ +static int notify_prepare(unsigned int cpu) +{ + int nr_calls = 0; + int ret; -static void cpu_notify_nofail(unsigned long val, void *v) + ret = __cpu_notify(CPU_UP_PREPARE, cpu, -1, &nr_calls); + if (ret) { + nr_calls--; + printk(KERN_WARNING "%s: attempt to bring up CPU %u failed\n", + __func__, cpu); + __cpu_notify(CPU_UP_CANCELED, cpu, nr_calls, NULL); + } + return ret; +} + +static int notify_online(unsigned int cpu) +{ + cpu_notify(CPU_ONLINE, cpu); + return 0; +} + +static int notify_starting(unsigned int cpu) { - BUG_ON(cpu_notify(val, v)); + cpu_notify(CPU_STARTING, cpu); + return 0; } + +static int bringup_wait_for_ap(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + wait_for_completion(&st->done); + return st->result; +} + +static int bringup_cpu(unsigned int cpu) +{ + struct task_struct *idle = idle_thread_get(cpu); + int ret; + + /* Arch-specific enabling code. */ + ret = __cpu_up(cpu, idle); + if (ret) { + cpu_notify(CPU_UP_CANCELED, cpu); + return ret; + } + ret = bringup_wait_for_ap(cpu); + BUG_ON(!cpu_online(cpu)); + return ret; +} + +/* + * Hotplug state machine related functions + */ +static void undo_cpu_down(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state++; st->state < st->target; st->state++) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->startup); + } +} + +static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + for (; st->state > target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + ret = cpuhp_invoke_callback(cpu, st->state, step->teardown); + if (ret) { + st->target = prev_state; + undo_cpu_down(cpu, st, steps); + break; + } + } + return ret; +} + +static void undo_cpu_up(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps) +{ + for (st->state--; st->state > st->target; st->state--) { + struct cpuhp_step *step = steps + st->state; + + if (!step->skip_onerr) + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } +} + +static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st, + struct cpuhp_step *steps, enum cpuhp_state target) +{ + enum cpuhp_state prev_state = st->state; + int ret = 0; + + while (st->state < target) { + struct cpuhp_step *step; + + st->state++; + step = steps + st->state; + ret = cpuhp_invoke_callback(cpu, st->state, step->startup); + if (ret) { + st->target = prev_state; + undo_cpu_up(cpu, st, steps); + break; + } + } + return ret; +} + +/* + * The cpu hotplug threads manage the bringup and teardown of the cpus + */ +static void cpuhp_create(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + init_completion(&st->done); +} + +static int cpuhp_should_run(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + return st->should_run; +} + +/* Execute the teardown callbacks. Used to be CPU_DOWN_PREPARE */ +static int cpuhp_ap_offline(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + enum cpuhp_state target = max((int)st->target, CPUHP_TEARDOWN_CPU); + + return cpuhp_down_callbacks(cpu, st, cpuhp_ap_states, target); +} + +/* Execute the online startup callbacks. Used to be CPU_ONLINE */ +static int cpuhp_ap_online(unsigned int cpu, struct cpuhp_cpu_state *st) +{ + return cpuhp_up_callbacks(cpu, st, cpuhp_ap_states, st->target); +} + +/* + * Execute teardown/startup callbacks on the plugged cpu. Also used to invoke + * callbacks when a state gets [un]installed at runtime. + */ +static void cpuhp_thread_fun(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + int ret = 0; + + /* + * Paired with the mb() in cpuhp_kick_ap_work and + * cpuhp_invoke_ap_callback, so the work set is consistent visible. + */ + smp_mb(); + if (!st->should_run) + return; + + st->should_run = false; + + /* Single callback invocation for [un]install ? */ + if (st->cb) { + if (st->cb_state < CPUHP_AP_ONLINE) { + local_irq_disable(); + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + local_irq_enable(); + } else { + ret = cpuhp_invoke_callback(cpu, st->cb_state, st->cb); + } + } else { + /* Cannot happen .... */ + BUG_ON(st->state < CPUHP_AP_ONLINE_IDLE); + + /* Regular hotplug work */ + if (st->state < st->target) + ret = cpuhp_ap_online(cpu, st); + else if (st->state > st->target) + ret = cpuhp_ap_offline(cpu, st); + } + st->result = ret; + complete(&st->done); +} + +/* Invoke a single callback on a remote cpu */ +static int cpuhp_invoke_ap_callback(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int)) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + + if (!cpu_online(cpu)) + return 0; + + st->cb_state = state; + st->cb = cb; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); + wait_for_completion(&st->done); + return st->result; +} + +/* Regular hotplug invocation of the AP hotplug thread */ +static void __cpuhp_kick_ap_work(struct cpuhp_cpu_state *st) +{ + st->result = 0; + st->cb = NULL; + /* + * Make sure the above stores are visible before should_run becomes + * true. Paired with the mb() above in cpuhp_thread_fun() + */ + smp_mb(); + st->should_run = true; + wake_up_process(st->thread); +} + +static int cpuhp_kick_ap_work(unsigned int cpu) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state state = st->state; + + trace_cpuhp_enter(cpu, st->target, state, cpuhp_kick_ap_work); + __cpuhp_kick_ap_work(st); + wait_for_completion(&st->done); + trace_cpuhp_exit(cpu, st->state, state, st->result); + return st->result; +} + +static struct smp_hotplug_thread cpuhp_threads = { + .store = &cpuhp_state.thread, + .create = &cpuhp_create, + .thread_should_run = cpuhp_should_run, + .thread_fn = cpuhp_thread_fun, + .thread_comm = "cpuhp/%u", + .selfparking = true, +}; + +void __init cpuhp_threads_init(void) +{ + BUG_ON(smpboot_register_percpu_thread(&cpuhp_threads)); + kthread_unpark(this_cpu_read(cpuhp_state.thread)); +} + +#ifdef CONFIG_HOTPLUG_CPU EXPORT_SYMBOL(register_cpu_notifier); EXPORT_SYMBOL(__register_cpu_notifier); - void unregister_cpu_notifier(struct notifier_block *nb) { cpu_maps_update_begin(); @@ -311,57 +636,60 @@ static inline void check_for_tasks(int dead_cpu) read_unlock(&tasklist_lock); } -struct take_cpu_down_param { - unsigned long mod; - void *hcpu; -}; +static void cpu_notify_nofail(unsigned long val, unsigned int cpu) +{ + BUG_ON(cpu_notify(val, cpu)); +} + +static int notify_down_prepare(unsigned int cpu) +{ + int err, nr_calls = 0; + + err = __cpu_notify(CPU_DOWN_PREPARE, cpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED, cpu, nr_calls, NULL); + pr_warn("%s: attempt to take down CPU %u failed\n", + __func__, cpu); + } + return err; +} + +static int notify_dying(unsigned int cpu) +{ + cpu_notify(CPU_DYING, cpu); + return 0; +} /* Take this CPU down. */ static int take_cpu_down(void *_param) { - struct take_cpu_down_param *param = _param; - int err; + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + enum cpuhp_state target = max((int)st->target, CPUHP_AP_OFFLINE); + int err, cpu = smp_processor_id(); /* Ensure this CPU doesn't handle any more interrupts. */ err = __cpu_disable(); if (err < 0) return err; - cpu_notify(CPU_DYING | param->mod, param->hcpu); + /* Invoke the former CPU_DYING callbacks */ + for (; st->state > target; st->state--) { + struct cpuhp_step *step = cpuhp_ap_states + st->state; + + cpuhp_invoke_callback(cpu, st->state, step->teardown); + } /* Give up timekeeping duties */ tick_handover_do_timer(); /* Park the stopper thread */ - stop_machine_park((long)param->hcpu); + stop_machine_park(cpu); return 0; } -/* Requires cpu_add_remove_lock to be held */ -static int _cpu_down(unsigned int cpu, int tasks_frozen) +static int takedown_cpu(unsigned int cpu) { - int err, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; - struct take_cpu_down_param tcd_param = { - .mod = mod, - .hcpu = hcpu, - }; - - if (num_online_cpus() == 1) - return -EBUSY; - - if (!cpu_online(cpu)) - return -EINVAL; - - cpu_hotplug_begin(); - - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - pr_warn("%s: attempt to take down CPU %u failed\n", - __func__, cpu); - goto out_release; - } + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int err; /* * By now we've cleared cpu_active_mask, wait for all preempt-disabled @@ -378,6 +706,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) else synchronize_rcu(); + /* Park the smpboot threads */ + kthread_park(per_cpu_ptr(&cpuhp_state, cpu)->thread); smpboot_park_threads(cpu); /* @@ -389,12 +719,12 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* * So now all preempt/rcu users must observe !cpu_active(). */ - err = stop_machine(take_cpu_down, &tcd_param, cpumask_of(cpu)); + err = stop_machine(take_cpu_down, NULL, cpumask_of(cpu)); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + cpu_notify_nofail(CPU_DOWN_FAILED, cpu); irq_unlock_sparse(); - goto out_release; + return err; } BUG_ON(cpu_online(cpu)); @@ -405,10 +735,8 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) * * Wait for the stop thread to go away. */ - while (!per_cpu(cpu_dead_idle, cpu)) - cpu_relax(); - smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */ - per_cpu(cpu_dead_idle, cpu) = false; + wait_for_completion(&st->done); + BUG_ON(st->state != CPUHP_AP_IDLE_DEAD); /* Interrupts are moved away from the dying cpu, reenable alloc/free */ irq_unlock_sparse(); @@ -417,20 +745,104 @@ static int _cpu_down(unsigned int cpu, int tasks_frozen) /* This actually kills the CPU. */ __cpu_die(cpu); - /* CPU is completely dead: tell everyone. Too late to complain. */ tick_cleanup_dead_cpu(cpu); - cpu_notify_nofail(CPU_DEAD | mod, hcpu); + return 0; +} +static int notify_dead(unsigned int cpu) +{ + cpu_notify_nofail(CPU_DEAD, cpu); check_for_tasks(cpu); + return 0; +} + +static void cpuhp_complete_idle_dead(void *arg) +{ + struct cpuhp_cpu_state *st = arg; + + complete(&st->done); +} + +void cpuhp_report_idle_dead(void) +{ + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + + BUG_ON(st->state != CPUHP_AP_OFFLINE); + rcu_report_dead(smp_processor_id()); + st->state = CPUHP_AP_IDLE_DEAD; + /* + * We cannot call complete after rcu_report_dead() so we delegate it + * to an online cpu. + */ + smp_call_function_single(cpumask_first(cpu_online_mask), + cpuhp_complete_idle_dead, st, 0); +} + +#else +#define notify_down_prepare NULL +#define takedown_cpu NULL +#define notify_dead NULL +#define notify_dying NULL +#endif + +#ifdef CONFIG_HOTPLUG_CPU + +/* Requires cpu_add_remove_lock to be held */ +static int __ref _cpu_down(unsigned int cpu, int tasks_frozen, + enum cpuhp_state target) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int prev_state, ret = 0; + bool hasdied = false; + + if (num_online_cpus() == 1) + return -EBUSY; + + if (!cpu_present(cpu)) + return -EINVAL; + + cpu_hotplug_begin(); -out_release: + cpuhp_tasks_frozen = tasks_frozen; + + prev_state = st->state; + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread. + */ + if (st->state > CPUHP_TEARDOWN_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + + /* + * We might have stopped still in the range of the AP hotplug + * thread. Nothing to do anymore. + */ + if (st->state > CPUHP_TEARDOWN_CPU) + goto out; + } + /* + * The AP brought itself down to CPUHP_TEARDOWN_CPU. So we need + * to do the further cleanups. + */ + ret = cpuhp_down_callbacks(cpu, st, cpuhp_bp_states, target); + + hasdied = prev_state != st->state && st->state == CPUHP_OFFLINE; +out: cpu_hotplug_done(); - if (!err) - cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); - return err; + /* This post dead nonsense must die */ + if (!ret && hasdied) + cpu_notify_nofail(CPU_POST_DEAD, cpu); + return ret; } -int cpu_down(unsigned int cpu) +static int do_cpu_down(unsigned int cpu, enum cpuhp_state target) { int err; @@ -441,100 +853,131 @@ int cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + err = _cpu_down(cpu, 0, target); out: cpu_maps_update_done(); return err; } +int cpu_down(unsigned int cpu) +{ + return do_cpu_down(cpu, CPUHP_OFFLINE); +} EXPORT_SYMBOL(cpu_down); #endif /*CONFIG_HOTPLUG_CPU*/ -/* - * Unpark per-CPU smpboot kthreads at CPU-online time. +/** + * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers + * @cpu: cpu that just started + * + * This function calls the cpu_chain notifiers with CPU_STARTING. + * It must be called by the arch code on the new cpu, before the new cpu + * enables interrupts and before the "boot" cpu returns from __cpu_up(). */ -static int smpboot_thread_call(struct notifier_block *nfb, - unsigned long action, void *hcpu) +void notify_cpu_starting(unsigned int cpu) { - int cpu = (long)hcpu; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + enum cpuhp_state target = min((int)st->target, CPUHP_AP_ONLINE); - switch (action & ~CPU_TASKS_FROZEN) { + while (st->state < target) { + struct cpuhp_step *step; - case CPU_DOWN_FAILED: - case CPU_ONLINE: - smpboot_unpark_threads(cpu); - break; - - default: - break; + st->state++; + step = cpuhp_ap_states + st->state; + cpuhp_invoke_callback(cpu, st->state, step->startup); } - - return NOTIFY_OK; } -static struct notifier_block smpboot_thread_notifier = { - .notifier_call = smpboot_thread_call, - .priority = CPU_PRI_SMPBOOT, -}; - -void smpboot_thread_init(void) +/* + * Called from the idle task. We need to set active here, so we can kick off + * the stopper thread and unpark the smpboot threads. If the target state is + * beyond CPUHP_AP_ONLINE_IDLE we kick cpuhp thread and let it bring up the + * cpu further. + */ +void cpuhp_online_idle(enum cpuhp_state state) { - register_cpu_notifier(&smpboot_thread_notifier); + struct cpuhp_cpu_state *st = this_cpu_ptr(&cpuhp_state); + unsigned int cpu = smp_processor_id(); + + /* Happens for the boot cpu */ + if (state != CPUHP_AP_ONLINE_IDLE) + return; + + st->state = CPUHP_AP_ONLINE_IDLE; + + /* The cpu is marked online, set it active now */ + set_cpu_active(cpu, true); + /* Unpark the stopper thread and the hotplug thread of this cpu */ + stop_machine_unpark(cpu); + kthread_unpark(st->thread); + + /* Should we go further up ? */ + if (st->target > CPUHP_AP_ONLINE_IDLE) + __cpuhp_kick_ap_work(st); + else + complete(&st->done); } /* Requires cpu_add_remove_lock to be held */ -static int _cpu_up(unsigned int cpu, int tasks_frozen) +static int _cpu_up(unsigned int cpu, int tasks_frozen, enum cpuhp_state target) { - int ret, nr_calls = 0; - void *hcpu = (void *)(long)cpu; - unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); struct task_struct *idle; + int ret = 0; cpu_hotplug_begin(); - if (cpu_online(cpu) || !cpu_present(cpu)) { + if (!cpu_present(cpu)) { ret = -EINVAL; goto out; } - idle = idle_thread_get(cpu); - if (IS_ERR(idle)) { - ret = PTR_ERR(idle); - goto out; - } - - ret = smpboot_create_threads(cpu); - if (ret) + /* + * The caller of do_cpu_up might have raced with another + * caller. Ignore it for now. + */ + if (st->state >= target) goto out; - ret = __cpu_notify(CPU_UP_PREPARE | mod, hcpu, -1, &nr_calls); - if (ret) { - nr_calls--; - pr_warn("%s: attempt to bring up CPU %u failed\n", - __func__, cpu); - goto out_notify; + if (st->state == CPUHP_OFFLINE) { + /* Let it fail before we try to bring the cpu up */ + idle = idle_thread_get(cpu); + if (IS_ERR(idle)) { + ret = PTR_ERR(idle); + goto out; + } } - /* Arch-specific enabling code. */ - ret = __cpu_up(cpu, idle); - - if (ret != 0) - goto out_notify; - BUG_ON(!cpu_online(cpu)); + cpuhp_tasks_frozen = tasks_frozen; - /* Now call notifier in preparation. */ - cpu_notify(CPU_ONLINE | mod, hcpu); + st->target = target; + /* + * If the current CPU state is in the range of the AP hotplug thread, + * then we need to kick the thread once more. + */ + if (st->state > CPUHP_BRINGUP_CPU) { + ret = cpuhp_kick_ap_work(cpu); + /* + * The AP side has done the error rollback already. Just + * return the error code.. + */ + if (ret) + goto out; + } -out_notify: - if (ret != 0) - __cpu_notify(CPU_UP_CANCELED | mod, hcpu, nr_calls, NULL); + /* + * Try to reach the target state. We max out on the BP at + * CPUHP_BRINGUP_CPU. After that the AP hotplug thread is + * responsible for bringing it up to the target state. + */ + target = min((int)target, CPUHP_BRINGUP_CPU); + ret = cpuhp_up_callbacks(cpu, st, cpuhp_bp_states, target); out: cpu_hotplug_done(); - return ret; } -int cpu_up(unsigned int cpu) +static int do_cpu_up(unsigned int cpu, enum cpuhp_state target) { int err = 0; @@ -558,12 +1001,16 @@ int cpu_up(unsigned int cpu) goto out; } - err = _cpu_up(cpu, 0); - + err = _cpu_up(cpu, 0, target); out: cpu_maps_update_done(); return err; } + +int cpu_up(unsigned int cpu) +{ + return do_cpu_up(cpu, CPUHP_ONLINE); +} EXPORT_SYMBOL_GPL(cpu_up); #ifdef CONFIG_PM_SLEEP_SMP @@ -586,7 +1033,7 @@ int disable_nonboot_cpus(void) if (cpu == first_cpu) continue; trace_suspend_resume(TPS("CPU_OFF"), cpu, true); - error = _cpu_down(cpu, 1); + error = _cpu_down(cpu, 1, CPUHP_OFFLINE); trace_suspend_resume(TPS("CPU_OFF"), cpu, false); if (!error) cpumask_set_cpu(cpu, frozen_cpus); @@ -636,7 +1083,7 @@ void enable_nonboot_cpus(void) for_each_cpu(cpu, frozen_cpus) { trace_suspend_resume(TPS("CPU_ON"), cpu, true); - error = _cpu_up(cpu, 1); + error = _cpu_up(cpu, 1, CPUHP_ONLINE); trace_suspend_resume(TPS("CPU_ON"), cpu, false); if (!error) { pr_info("CPU%d is up\n", cpu); @@ -709,26 +1156,463 @@ core_initcall(cpu_hotplug_pm_sync_init); #endif /* CONFIG_PM_SLEEP_SMP */ +#endif /* CONFIG_SMP */ + +/* Boot processor state steps */ +static struct cpuhp_step cpuhp_bp_states[] = { + [CPUHP_OFFLINE] = { + .name = "offline", + .startup = NULL, + .teardown = NULL, + }, +#ifdef CONFIG_SMP + [CPUHP_CREATE_THREADS]= { + .name = "threads:create", + .startup = smpboot_create_threads, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Preparatory and dead notifiers. Will be replaced once the notifiers + * are converted to states. + */ + [CPUHP_NOTIFY_PREPARE] = { + .name = "notify:prepare", + .startup = notify_prepare, + .teardown = notify_dead, + .skip_onerr = true, + .cant_stop = true, + }, + /* Kicks the plugged cpu into life */ + [CPUHP_BRINGUP_CPU] = { + .name = "cpu:bringup", + .startup = bringup_cpu, + .teardown = NULL, + .cant_stop = true, + }, + /* + * Handled on controll processor until the plugged processor manages + * this itself. + */ + [CPUHP_TEARDOWN_CPU] = { + .name = "cpu:teardown", + .startup = NULL, + .teardown = takedown_cpu, + .cant_stop = true, + }, +#endif +}; + +/* Application processor state steps */ +static struct cpuhp_step cpuhp_ap_states[] = { +#ifdef CONFIG_SMP + /* Final state before CPU kills itself */ + [CPUHP_AP_IDLE_DEAD] = { + .name = "idle:dead", + }, + /* + * Last state before CPU enters the idle loop to die. Transient state + * for synchronization. + */ + [CPUHP_AP_OFFLINE] = { + .name = "ap:offline", + .cant_stop = true, + }, + /* + * Low level startup/teardown notifiers. Run with interrupts + * disabled. Will be removed once the notifiers are converted to + * states. + */ + [CPUHP_AP_NOTIFY_STARTING] = { + .name = "notify:starting", + .startup = notify_starting, + .teardown = notify_dying, + .skip_onerr = true, + .cant_stop = true, + }, + /* Entry state on starting. Interrupts enabled from here on. Transient + * state for synchronsization */ + [CPUHP_AP_ONLINE] = { + .name = "ap:online", + }, + /* Handle smpboot threads park/unpark */ + [CPUHP_AP_SMPBOOT_THREADS] = { + .name = "smpboot:threads", + .startup = smpboot_unpark_threads, + .teardown = NULL, + }, + /* + * Online/down_prepare notifiers. Will be removed once the notifiers + * are converted to states. + */ + [CPUHP_AP_NOTIFY_ONLINE] = { + .name = "notify:online", + .startup = notify_online, + .teardown = notify_down_prepare, + }, +#endif + /* + * The dynamically registered state space is here + */ + + /* CPU is fully up and running. */ + [CPUHP_ONLINE] = { + .name = "online", + .startup = NULL, + .teardown = NULL, + }, +}; + +/* Sanity check for callbacks */ +static int cpuhp_cb_check(enum cpuhp_state state) +{ + if (state <= CPUHP_OFFLINE || state >= CPUHP_ONLINE) + return -EINVAL; + return 0; +} + +static bool cpuhp_is_ap_state(enum cpuhp_state state) +{ + /* + * The extra check for CPUHP_TEARDOWN_CPU is only for documentation + * purposes as that state is handled explicitely in cpu_down. + */ + return state > CPUHP_BRINGUP_CPU && state != CPUHP_TEARDOWN_CPU; +} + +static struct cpuhp_step *cpuhp_get_step(enum cpuhp_state state) +{ + struct cpuhp_step *sp; + + sp = cpuhp_is_ap_state(state) ? cpuhp_ap_states : cpuhp_bp_states; + return sp + state; +} + +static void cpuhp_store_callbacks(enum cpuhp_state state, + const char *name, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) +{ + /* (Un)Install the callbacks for further cpu hotplug operations */ + struct cpuhp_step *sp; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(state); + sp->startup = startup; + sp->teardown = teardown; + sp->name = name; + mutex_unlock(&cpuhp_state_mutex); +} + +static void *cpuhp_get_teardown_cb(enum cpuhp_state state) +{ + return cpuhp_get_step(state)->teardown; +} + +/* + * Call the startup/teardown function for a step either on the AP or + * on the current CPU. + */ +static int cpuhp_issue_call(int cpu, enum cpuhp_state state, + int (*cb)(unsigned int), bool bringup) +{ + int ret; + + if (!cb) + return 0; + /* + * The non AP bound callbacks can fail on bringup. On teardown + * e.g. module removal we crash for now. + */ +#ifdef CONFIG_SMP + if (cpuhp_is_ap_state(state)) + ret = cpuhp_invoke_ap_callback(cpu, state, cb); + else + ret = cpuhp_invoke_callback(cpu, state, cb); +#else + ret = cpuhp_invoke_callback(cpu, state, cb); +#endif + BUG_ON(ret && !bringup); + return ret; +} + +/* + * Called from __cpuhp_setup_state on a recoverable failure. + * + * Note: The teardown callbacks for rollback are not allowed to fail! + */ +static void cpuhp_rollback_install(int failedcpu, enum cpuhp_state state, + int (*teardown)(unsigned int cpu)) +{ + int cpu; + + if (!teardown) + return; + + /* Roll back the already executed steps on the other cpus */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpu >= failedcpu) + break; + + /* Did we invoke the startup call on that cpu ? */ + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +} + +/* + * Returns a free for dynamic slot assignment of the Online state. The states + * are protected by the cpuhp_slot_states mutex and an empty slot is identified + * by having no name assigned. + */ +static int cpuhp_reserve_state(enum cpuhp_state state) +{ + enum cpuhp_state i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_AP_ONLINE_DYN; i <= CPUHP_AP_ONLINE_DYN_END; i++) { + if (cpuhp_ap_states[i].name) + continue; + + cpuhp_ap_states[i].name = "Reserved"; + mutex_unlock(&cpuhp_state_mutex); + return i; + } + mutex_unlock(&cpuhp_state_mutex); + WARN(1, "No more dynamic states available for CPU hotplug\n"); + return -ENOSPC; +} + /** - * notify_cpu_starting(cpu) - call the CPU_STARTING notifiers - * @cpu: cpu that just started + * __cpuhp_setup_state - Setup the callbacks for an hotplug machine state + * @state: The state to setup + * @invoke: If true, the startup function is invoked for cpus where + * cpu state >= @state + * @startup: startup callback function + * @teardown: teardown callback function * - * This function calls the cpu_chain notifiers with CPU_STARTING. - * It must be called by the arch code on the new cpu, before the new cpu - * enables interrupts and before the "boot" cpu returns from __cpu_up(). + * Returns 0 if successful, otherwise a proper error code */ -void notify_cpu_starting(unsigned int cpu) +int __cpuhp_setup_state(enum cpuhp_state state, + const char *name, bool invoke, + int (*startup)(unsigned int cpu), + int (*teardown)(unsigned int cpu)) { - unsigned long val = CPU_STARTING; + int cpu, ret = 0; + int dyn_state = 0; -#ifdef CONFIG_PM_SLEEP_SMP - if (frozen_cpus != NULL && cpumask_test_cpu(cpu, frozen_cpus)) - val = CPU_STARTING_FROZEN; -#endif /* CONFIG_PM_SLEEP_SMP */ - cpu_notify(val, (void *)(long)cpu); + if (cpuhp_cb_check(state) || !name) + return -EINVAL; + + get_online_cpus(); + + /* currently assignments for the ONLINE state are possible */ + if (state == CPUHP_AP_ONLINE_DYN) { + dyn_state = 1; + ret = cpuhp_reserve_state(state); + if (ret < 0) + goto out; + state = ret; + } + + cpuhp_store_callbacks(state, name, startup, teardown); + + if (!invoke || !startup) + goto out; + + /* + * Try to call the startup callback for each present cpu + * depending on the hotplug state of the cpu. + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate < state) + continue; + + ret = cpuhp_issue_call(cpu, state, startup, true); + if (ret) { + cpuhp_rollback_install(cpu, state, teardown); + cpuhp_store_callbacks(state, NULL, NULL, NULL); + goto out; + } + } +out: + put_online_cpus(); + if (!ret && dyn_state) + return state; + return ret; } +EXPORT_SYMBOL(__cpuhp_setup_state); -#endif /* CONFIG_SMP */ +/** + * __cpuhp_remove_state - Remove the callbacks for an hotplug machine state + * @state: The state to remove + * @invoke: If true, the teardown function is invoked for cpus where + * cpu state >= @state + * + * The teardown callback is currently not allowed to fail. Think + * about module removal! + */ +void __cpuhp_remove_state(enum cpuhp_state state, bool invoke) +{ + int (*teardown)(unsigned int cpu) = cpuhp_get_teardown_cb(state); + int cpu; + + BUG_ON(cpuhp_cb_check(state)); + + get_online_cpus(); + + if (!invoke || !teardown) + goto remove; + + /* + * Call the teardown callback for each present cpu depending + * on the hotplug state of the cpu. This function is not + * allowed to fail currently! + */ + for_each_present_cpu(cpu) { + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, cpu); + int cpustate = st->state; + + if (cpustate >= state) + cpuhp_issue_call(cpu, state, teardown, false); + } +remove: + cpuhp_store_callbacks(state, NULL, NULL, NULL); + put_online_cpus(); +} +EXPORT_SYMBOL(__cpuhp_remove_state); + +#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU) +static ssize_t show_cpuhp_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->state); +} +static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL); + +static ssize_t write_cpuhp_target(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + struct cpuhp_step *sp; + int target, ret; + + ret = kstrtoint(buf, 10, &target); + if (ret) + return ret; + +#ifdef CONFIG_CPU_HOTPLUG_STATE_CONTROL + if (target < CPUHP_OFFLINE || target > CPUHP_ONLINE) + return -EINVAL; +#else + if (target != CPUHP_OFFLINE && target != CPUHP_ONLINE) + return -EINVAL; +#endif + + ret = lock_device_hotplug_sysfs(); + if (ret) + return ret; + + mutex_lock(&cpuhp_state_mutex); + sp = cpuhp_get_step(target); + ret = !sp->name || sp->cant_stop ? -EINVAL : 0; + mutex_unlock(&cpuhp_state_mutex); + if (ret) + return ret; + + if (st->state < target) + ret = do_cpu_up(dev->id, target); + else + ret = do_cpu_down(dev->id, target); + + unlock_device_hotplug(); + return ret ? ret : count; +} + +static ssize_t show_cpuhp_target(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id); + + return sprintf(buf, "%d\n", st->target); +} +static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target); + +static struct attribute *cpuhp_cpu_attrs[] = { + &dev_attr_state.attr, + &dev_attr_target.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_attr_group = { + .attrs = cpuhp_cpu_attrs, + .name = "hotplug", + NULL +}; + +static ssize_t show_cpuhp_states(struct device *dev, + struct device_attribute *attr, char *buf) +{ + ssize_t cur, res = 0; + int i; + + mutex_lock(&cpuhp_state_mutex); + for (i = CPUHP_OFFLINE; i <= CPUHP_ONLINE; i++) { + struct cpuhp_step *sp = cpuhp_get_step(i); + + if (sp->name) { + cur = sprintf(buf, "%3d: %s\n", i, sp->name); + buf += cur; + res += cur; + } + } + mutex_unlock(&cpuhp_state_mutex); + return res; +} +static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL); + +static struct attribute *cpuhp_cpu_root_attrs[] = { + &dev_attr_states.attr, + NULL +}; + +static struct attribute_group cpuhp_cpu_root_attr_group = { + .attrs = cpuhp_cpu_root_attrs, + .name = "hotplug", + NULL +}; + +static int __init cpuhp_sysfs_init(void) +{ + int cpu, ret; + + ret = sysfs_create_group(&cpu_subsys.dev_root->kobj, + &cpuhp_cpu_root_attr_group); + if (ret) + return ret; + + for_each_possible_cpu(cpu) { + struct device *dev = get_cpu_device(cpu); + + if (!dev) + continue; + ret = sysfs_create_group(&dev->kobj, &cpuhp_cpu_attr_group); + if (ret) + return ret; + } + return 0; +} +device_initcall(cpuhp_sysfs_init); +#endif /* * cpu_bit_bitmap[] is a special, "compressed" data structure that @@ -759,71 +1643,55 @@ const DECLARE_BITMAP(cpu_all_bits, NR_CPUS) = CPU_BITS_ALL; EXPORT_SYMBOL(cpu_all_bits); #ifdef CONFIG_INIT_ALL_POSSIBLE -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly - = CPU_BITS_ALL; +struct cpumask __cpu_possible_mask __read_mostly + = {CPU_BITS_ALL}; #else -static DECLARE_BITMAP(cpu_possible_bits, CONFIG_NR_CPUS) __read_mostly; +struct cpumask __cpu_possible_mask __read_mostly; #endif -const struct cpumask *const cpu_possible_mask = to_cpumask(cpu_possible_bits); -EXPORT_SYMBOL(cpu_possible_mask); +EXPORT_SYMBOL(__cpu_possible_mask); -static DECLARE_BITMAP(cpu_online_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_online_mask = to_cpumask(cpu_online_bits); -EXPORT_SYMBOL(cpu_online_mask); +struct cpumask __cpu_online_mask __read_mostly; +EXPORT_SYMBOL(__cpu_online_mask); -static DECLARE_BITMAP(cpu_present_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_present_mask = to_cpumask(cpu_present_bits); -EXPORT_SYMBOL(cpu_present_mask); +struct cpumask __cpu_present_mask __read_mostly; +EXPORT_SYMBOL(__cpu_present_mask); -static DECLARE_BITMAP(cpu_active_bits, CONFIG_NR_CPUS) __read_mostly; -const struct cpumask *const cpu_active_mask = to_cpumask(cpu_active_bits); -EXPORT_SYMBOL(cpu_active_mask); +struct cpumask __cpu_active_mask __read_mostly; +EXPORT_SYMBOL(__cpu_active_mask); -void set_cpu_possible(unsigned int cpu, bool possible) -{ - if (possible) - cpumask_set_cpu(cpu, to_cpumask(cpu_possible_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_possible_bits)); -} - -void set_cpu_present(unsigned int cpu, bool present) +void init_cpu_present(const struct cpumask *src) { - if (present) - cpumask_set_cpu(cpu, to_cpumask(cpu_present_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_present_bits)); + cpumask_copy(&__cpu_present_mask, src); } -void set_cpu_online(unsigned int cpu, bool online) +void init_cpu_possible(const struct cpumask *src) { - if (online) { - cpumask_set_cpu(cpu, to_cpumask(cpu_online_bits)); - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - } else { - cpumask_clear_cpu(cpu, to_cpumask(cpu_online_bits)); - } + cpumask_copy(&__cpu_possible_mask, src); } -void set_cpu_active(unsigned int cpu, bool active) +void init_cpu_online(const struct cpumask *src) { - if (active) - cpumask_set_cpu(cpu, to_cpumask(cpu_active_bits)); - else - cpumask_clear_cpu(cpu, to_cpumask(cpu_active_bits)); + cpumask_copy(&__cpu_online_mask, src); } -void init_cpu_present(const struct cpumask *src) +/* + * Activate the first processor. + */ +void __init boot_cpu_init(void) { - cpumask_copy(to_cpumask(cpu_present_bits), src); -} + int cpu = smp_processor_id(); -void init_cpu_possible(const struct cpumask *src) -{ - cpumask_copy(to_cpumask(cpu_possible_bits), src); + /* Mark the boot cpu "present", "online" etc for SMP and UP case */ + set_cpu_online(cpu, true); + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); } -void init_cpu_online(const struct cpumask *src) +/* + * Must be called _AFTER_ setting up the per_cpu areas + */ +void __init boot_cpu_state_init(void) { - cpumask_copy(to_cpumask(cpu_online_bits), src); + per_cpu_ptr(&cpuhp_state, smp_processor_id())->state = CPUHP_ONLINE; } diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 10ae73611d80..00ab5c2b7c5b 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -51,6 +51,7 @@ #include <linux/stat.h> #include <linux/string.h> #include <linux/time.h> +#include <linux/time64.h> #include <linux/backing-dev.h> #include <linux/sort.h> @@ -68,7 +69,7 @@ struct static_key cpusets_enabled_key __read_mostly = STATIC_KEY_INIT_FALSE; struct fmeter { int cnt; /* unprocessed events count */ int val; /* most recent output value */ - time_t time; /* clock (secs) when val computed */ + time64_t time; /* clock (secs) when val computed */ spinlock_t lock; /* guards read or write of above */ }; @@ -286,6 +287,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -971,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; + struct cpuset_migrate_mm_work *mwork; - tsk->mems_allowed = *to; - - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1096,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1374,7 +1398,7 @@ out: */ #define FM_COEF 933 /* coefficient for half-life of 10 secs */ -#define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */ +#define FM_MAXTICKS ((u32)99) /* useless computing more ticks than this */ #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */ #define FM_SCALE 1000 /* faux fixed point scale */ @@ -1390,8 +1414,11 @@ static void fmeter_init(struct fmeter *fmp) /* Internal meter update - process cnt events and update value */ static void fmeter_update(struct fmeter *fmp) { - time_t now = get_seconds(); - time_t ticks = now - fmp->time; + time64_t now; + u32 ticks; + + now = ktime_get_seconds(); + ticks = now - fmp->time; if (ticks == 0) return; @@ -1429,15 +1456,16 @@ static int fmeter_getrate(struct fmeter *fmp) static struct cpuset *cpuset_attach_old_cs; /* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */ -static int cpuset_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpuset_can_attach(struct cgroup_taskset *tset) { - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct task_struct *task; int ret; /* used later by cpuset_attach() */ - cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset)); + cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css)); + cs = css_cs(css); mutex_lock(&cpuset_mutex); @@ -1447,7 +1475,7 @@ static int cpuset_can_attach(struct cgroup_subsys_state *css, (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))) goto out_unlock; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { ret = task_can_attach(task, cs->cpus_allowed); if (ret) goto out_unlock; @@ -1467,9 +1495,14 @@ out_unlock: return ret; } -static void cpuset_cancel_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_cancel_attach(struct cgroup_taskset *tset) { + struct cgroup_subsys_state *css; + struct cpuset *cs; + + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); css_cs(css)->attach_in_progress--; mutex_unlock(&cpuset_mutex); @@ -1482,16 +1515,19 @@ static void cpuset_cancel_attach(struct cgroup_subsys_state *css, */ static cpumask_var_t cpus_attach; -static void cpuset_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpuset_attach(struct cgroup_taskset *tset) { /* static buf protected by cpuset_mutex */ static nodemask_t cpuset_attach_nodemask_to; struct task_struct *task; struct task_struct *leader; - struct cpuset *cs = css_cs(css); + struct cgroup_subsys_state *css; + struct cpuset *cs; struct cpuset *oldcs = cpuset_attach_old_cs; + cgroup_taskset_first(tset, &css); + cs = css_cs(css); + mutex_lock(&cpuset_mutex); /* prepare for attach */ @@ -1502,7 +1538,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, guarantee_online_mems(cs, &cpuset_attach_nodemask_to); - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { /* * can_attach beforehand should guarantee that this doesn't * fail. TODO: have a better way to handle failure here @@ -1518,7 +1554,7 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * sleep and should be moved outside migration path proper. */ cpuset_attach_nodemask_to = cs->effective_mems; - cgroup_taskset_for_each_leader(leader, tset) { + cgroup_taskset_for_each_leader(leader, css, tset) { struct mm_struct *mm = get_task_mm(leader); if (mm) { @@ -1532,11 +1568,11 @@ static void cpuset_attach(struct cgroup_subsys_state *css, * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1701,6 +1737,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2052,7 +2089,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .bind = cpuset_bind, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; /** @@ -2346,6 +2383,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** @@ -2674,10 +2714,10 @@ int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns, goto out; retval = -ENAMETOOLONG; - rcu_read_lock(); - css = task_css(tsk, cpuset_cgrp_id); - p = cgroup_path(css->cgroup, buf, PATH_MAX); - rcu_read_unlock(); + css = task_get_css(tsk, cpuset_cgrp_id); + p = cgroup_path_ns(css->cgroup, buf, PATH_MAX, + current->nsproxy->cgroup_ns); + css_put(css); if (!p) goto out_free; seq_puts(m, p); diff --git a/kernel/cred.c b/kernel/cred.c index 71179a09c1d6..0c0cd8a62285 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -569,8 +569,8 @@ EXPORT_SYMBOL(revert_creds); void __init cred_init(void) { /* allocate a slab in which we can store credentials */ - cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + cred_jar = kmem_cache_create("cred_jar", sizeof(struct cred), 0, + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT, NULL); } /** diff --git a/kernel/debug/kdb/kdb_bp.c b/kernel/debug/kdb/kdb_bp.c index e1dbf4a2c69e..90ff129c88a2 100644 --- a/kernel/debug/kdb/kdb_bp.c +++ b/kernel/debug/kdb/kdb_bp.c @@ -153,13 +153,11 @@ static int _kdb_bp_install(struct pt_regs *regs, kdb_bp_t *bp) } else { kdb_printf("%s: failed to set breakpoint at 0x%lx\n", __func__, bp->bp_addr); -#ifdef CONFIG_DEBUG_RODATA if (!bp->bp_type) { kdb_printf("Software breakpoints are unavailable.\n" - " Change the kernel CONFIG_DEBUG_RODATA=n\n" + " Boot the kernel with rodata=off\n" " OR use hw breaks: help bph\n"); } -#endif return 1; } return 0; diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4121345498e0..2a20c0dfdafc 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2021,7 +2021,7 @@ static int kdb_lsmod(int argc, const char **argv) continue; kdb_printf("%-20s%8u 0x%p ", mod->name, - mod->core_size, (void *)mod); + mod->core_layout.size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD kdb_printf("%4d ", module_refcount(mod)); #endif @@ -2031,7 +2031,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf(" (Loading)"); else kdb_printf(" (Live)"); - kdb_printf(" 0x%p", mod->module_core); + kdb_printf(" 0x%p", mod->core_layout.base); #ifdef CONFIG_MODULE_UNLOAD { diff --git a/kernel/delayacct.c b/kernel/delayacct.c index ef90b04d783f..435c14a45118 100644 --- a/kernel/delayacct.c +++ b/kernel/delayacct.c @@ -34,7 +34,7 @@ __setup("nodelayacct", delayacct_setup_disable); void delayacct_init(void) { - delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC); + delayacct_cache = KMEM_CACHE(task_delay_info, SLAB_PANIC|SLAB_ACCOUNT); delayacct_tsk_init(&init_task); } diff --git a/kernel/events/callchain.c b/kernel/events/callchain.c index d659487254d5..343c22f5e867 100644 --- a/kernel/events/callchain.c +++ b/kernel/events/callchain.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -159,15 +159,24 @@ put_callchain_entry(int rctx) struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs) { - int rctx; - struct perf_callchain_entry *entry; - - int kernel = !event->attr.exclude_callchain_kernel; - int user = !event->attr.exclude_callchain_user; + bool kernel = !event->attr.exclude_callchain_kernel; + bool user = !event->attr.exclude_callchain_user; + /* Disallow cross-task user callchains. */ + bool crosstask = event->ctx->task && event->ctx->task != current; if (!kernel && !user) return NULL; + return get_perf_callchain(regs, 0, kernel, user, crosstask, true); +} + +struct perf_callchain_entry * +get_perf_callchain(struct pt_regs *regs, u32 init_nr, bool kernel, bool user, + bool crosstask, bool add_mark) +{ + struct perf_callchain_entry *entry; + int rctx; + entry = get_callchain_entry(&rctx); if (rctx == -1) return NULL; @@ -175,10 +184,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) if (!entry) goto exit_put; - entry->nr = 0; + entry->nr = init_nr; if (kernel && !user_mode(regs)) { - perf_callchain_store(entry, PERF_CONTEXT_KERNEL); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_KERNEL); perf_callchain_kernel(entry, regs); } @@ -191,13 +201,11 @@ perf_callchain(struct perf_event *event, struct pt_regs *regs) } if (regs) { - /* - * Disallow cross-task user callchains. - */ - if (event->ctx->task && event->ctx->task != current) + if (crosstask) goto exit_put; - perf_callchain_store(entry, PERF_CONTEXT_USER); + if (add_mark) + perf_callchain_store(entry, PERF_CONTEXT_USER); perf_callchain_user(entry, regs); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index 1a734e0adfa7..52bedc5a5aaa 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -49,8 +49,6 @@ #include <asm/irq_regs.h> -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -66,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -126,11 +137,168 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -#define EVENT_OWNER_KERNEL ((void *) -1) +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + +#define TASK_TOMBSTONE ((void *)-1L) static bool is_kernel_event(struct perf_event *event) { - return event->owner == EVENT_OWNER_KERNEL; + return READ_ONCE(event->owner) == TASK_TOMBSTONE; +} + +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. + */ + +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; + struct perf_event_context *ctx = event->ctx; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; + + WARN_ON_ONCE(!irqs_disabled()); + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + */ + if (ctx->task) { + if (ctx->task != current) { + ret = -ESRCH; + goto unlock; + } + + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + efs->func(event, cpuctx, ctx, efs->data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); + + return ret; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } + + if (!task) { + cpu_function_call(event->cpu, event_function, &efs); + return; + } + + if (task == TASK_TOMBSTONE) + return; + +again: + if (!task_function_call(task, event_function, &efs)) + return; + + raw_spin_lock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } + if (ctx->is_active) { + raw_spin_unlock_irq(&ctx->lock); + goto again; + } + func(event, NULL, ctx, data); + raw_spin_unlock_irq(&ctx->lock); } #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ @@ -148,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -155,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -201,8 +376,11 @@ static void update_perf_cpu_limits(void) u64 tmp = perf_sample_period_ns; tmp *= sysctl_perf_cpu_time_max_percent; - do_div(tmp, 100); - ACCESS_ONCE(perf_sample_allowed_ns) = tmp; + tmp = div_u64(tmp, 100); + if (!tmp) + tmp = 1; + + WRITE_ONCE(perf_sample_allowed_ns, tmp); } static int perf_rotate_context(struct perf_cpu_context *cpuctx); @@ -234,7 +412,13 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, if (ret || !write) return ret; - update_perf_cpu_limits(); + if (sysctl_perf_cpu_time_max_percent == 100) { + printk(KERN_WARNING + "perf: Dynamic interrupt throttling disabled, can hang your system!\n"); + WRITE_ONCE(perf_sample_allowed_ns, 0); + } else { + update_perf_cpu_limits(); + } return 0; } @@ -248,62 +432,68 @@ int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, #define NR_ACCUMULATED_SAMPLES 128 static DEFINE_PER_CPU(u64, running_sample_length); +static u64 __report_avg; +static u64 __report_allowed; + static void perf_duration_warn(struct irq_work *w) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; - - local_samples_len = __this_cpu_read(running_sample_length); - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - printk_ratelimited(KERN_WARNING - "perf interrupt took too long (%lld > %lld), lowering " - "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, - sysctl_perf_event_sample_rate); + "perf: interrupt took too long (%lld > %lld), lowering " + "kernel.perf_event_max_sample_rate to %d\n", + __report_avg, __report_allowed, + sysctl_perf_event_sample_rate); } static DEFINE_IRQ_WORK(perf_duration_work, perf_duration_warn); void perf_sample_event_took(u64 sample_len_ns) { - u64 allowed_ns = ACCESS_ONCE(perf_sample_allowed_ns); - u64 avg_local_sample_len; - u64 local_samples_len; + u64 max_len = READ_ONCE(perf_sample_allowed_ns); + u64 running_len; + u64 avg_len; + u32 max; - if (allowed_ns == 0) + if (max_len == 0) return; - /* decay the counter by 1 average sample */ - local_samples_len = __this_cpu_read(running_sample_length); - local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES; - local_samples_len += sample_len_ns; - __this_cpu_write(running_sample_length, local_samples_len); + /* Decay the counter by 1 average sample. */ + running_len = __this_cpu_read(running_sample_length); + running_len -= running_len/NR_ACCUMULATED_SAMPLES; + running_len += sample_len_ns; + __this_cpu_write(running_sample_length, running_len); /* - * note: this will be biased artifically low until we have - * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us + * Note: this will be biased artifically low until we have + * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us * from having to maintain a count. */ - avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES; - - if (avg_local_sample_len <= allowed_ns) + avg_len = running_len/NR_ACCUMULATED_SAMPLES; + if (avg_len <= max_len) return; - if (max_samples_per_tick <= 1) - return; + __report_avg = avg_len; + __report_allowed = max_len; - max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2); - sysctl_perf_event_sample_rate = max_samples_per_tick * HZ; - perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; + /* + * Compute a throttle threshold 25% below the current duration. + */ + avg_len += avg_len / 4; + max = (TICK_NSEC / 100) * sysctl_perf_cpu_time_max_percent; + if (avg_len < max) + max /= (u32)avg_len; + else + max = 1; - update_perf_cpu_limits(); + WRITE_ONCE(perf_sample_allowed_ns, avg_len); + WRITE_ONCE(max_samples_per_tick, max); + + sysctl_perf_event_sample_rate = max * HZ; + perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate; if (!irq_work_queue(&perf_duration_work)) { - early_printk("perf interrupt took too long (%lld > %lld), lowering " + early_printk("perf: interrupt took too long (%lld > %lld), lowering " "kernel.perf_event_max_sample_rate to %d\n", - avg_local_sample_len, allowed_ns >> 1, + __report_avg, __report_allowed, sysctl_perf_event_sample_rate); } } @@ -337,28 +527,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -435,7 +603,7 @@ static inline void update_cgrp_time_from_event(struct perf_event *event) if (!is_cgroup_event(event)) return; - cgrp = perf_cgroup_from_task(current); + cgrp = perf_cgroup_from_task(current, event->ctx); /* * Do not update time when cgroup is not active */ @@ -458,7 +626,7 @@ perf_cgroup_set_timestamp(struct task_struct *task, if (!task || !ctx->nr_cgroups) return; - cgrp = perf_cgroup_from_task(task); + cgrp = perf_cgroup_from_task(task, ctx); info = this_cpu_ptr(cgrp->info); info->timestamp = ctx->timestamp; } @@ -489,7 +657,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * we reschedule only in the presence of cgroup * constrained events. */ - rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = this_cpu_ptr(pmu->pmu_cpu_context); @@ -522,8 +689,10 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) * set cgrp before ctxsw in to allow * event_filter_match() to not have to pass * task around + * we pass the cpuctx->ctx to perf_cgroup_from_task() + * because cgorup events are only per-cpu */ - cpuctx->cgrp = perf_cgroup_from_task(task); + cpuctx->cgrp = perf_cgroup_from_task(task, &cpuctx->ctx); cpu_ctx_sched_in(cpuctx, EVENT_ALL, task); } perf_pmu_enable(cpuctx->ctx.pmu); @@ -531,8 +700,6 @@ static void perf_cgroup_switch(struct task_struct *task, int mode) } } - rcu_read_unlock(); - local_irq_restore(flags); } @@ -542,17 +709,14 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next); + cgrp1 = perf_cgroup_from_task(task, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -561,6 +725,8 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWOUT); + + rcu_read_unlock(); } static inline void perf_cgroup_sched_in(struct task_struct *prev, @@ -569,13 +735,14 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, struct perf_cgroup *cgrp1; struct perf_cgroup *cgrp2 = NULL; + rcu_read_lock(); /* * we come here when we know perf_cgroup_events > 0 + * we do not need to pass the ctx here because we know + * we are holding the rcu lock */ - cgrp1 = perf_cgroup_from_task(task); - - /* prev can never be NULL */ - cgrp2 = perf_cgroup_from_task(prev); + cgrp1 = perf_cgroup_from_task(task, NULL); + cgrp2 = perf_cgroup_from_task(prev, NULL); /* * only need to schedule in cgroup events if we are changing @@ -584,6 +751,8 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, */ if (cgrp1 != cgrp2) perf_cgroup_switch(task, PERF_CGROUP_SWIN); + + rcu_read_unlock(); } static inline int perf_cgroup_connect(int fd, struct perf_event *event, @@ -877,7 +1046,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -894,9 +1063,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -939,8 +1107,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ @@ -1038,6 +1206,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. + * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1050,13 +1219,13 @@ retry: /* * One of the few rules of preemptible RCU is that one cannot do * rcu_read_unlock() while holding a scheduler (or nested) lock when - * part of the read side critical section was preemptible -- see + * part of the read side critical section was irqs-enabled -- see * rcu_read_unlock_special(). * * Since ctx->lock nests under rq->lock we must ensure the entire read - * side critical section is non-preemptible. + * side critical section has interrupts disabled. */ - preempt_disable(); + local_irq_save(*flags); rcu_read_lock(); ctx = rcu_dereference(task->perf_event_ctxp[ctxn]); if (ctx) { @@ -1070,21 +1239,25 @@ retry: * if so. If we locked the right context, then it * can't get swapped on us any more. */ - raw_spin_lock_irqsave(&ctx->lock, *flags); + raw_spin_lock(&ctx->lock); if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + raw_spin_unlock(&ctx->lock); rcu_read_unlock(); - preempt_enable(); + local_irq_restore(*flags); goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { - raw_spin_unlock_irqrestore(&ctx->lock, *flags); + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { + raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); - preempt_enable(); + if (!ctx) + local_irq_restore(*flags); return ctx; } @@ -1139,16 +1312,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1205,6 +1380,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -1407,11 +1584,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -1489,45 +1669,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; -} - -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 0 : -1; + return event->state == PERF_EVENT_STATE_DEAD; } -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1570,14 +1716,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1588,9 +1734,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1614,10 +1757,7 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; +#define DETACH_GROUP 0x01UL /* * Cross CPU call to remove a performance event @@ -1625,34 +1765,31 @@ struct remove_event { * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + unsigned long flags = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } - raw_spin_unlock(&ctx->lock); - - return 0; } - /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1660,96 +1797,32 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { - struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; - - lockdep_assert_held(&ctx->mutex); + lockdep_assert_held(&event->ctx->mutex); - if (!task) { - /* - * Per cpu events are removed via an smp call. The removal can - * fail if the CPU is currently offline, but in that case we - * already called __perf_remove_from_context from - * perf_event_exit_cpu. - */ - cpu_function_call(event->cpu, __perf_remove_from_context, &re); - return; - } - -retry: - if (!task_function_call(task, __perf_remove_from_context, &re)) - return; - - raw_spin_lock_irq(&ctx->lock); - /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. - */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since the task isn't running, its safe to remove the event, us - * holding the ctx->lock ensures the task won't get scheduled in. - */ - if (detach_group) - perf_group_detach(event); - list_del_event(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - return 0; + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1760,7 +1833,8 @@ int __perf_event_disable(void *info) * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -1768,43 +1842,20 @@ int __perf_event_disable(void *info) static void _perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - - if (!task) { - /* - * Disable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_disable, event); - return; - } - -retry: - if (!task_function_call(task, __perf_event_disable, event)) - return; raw_spin_lock_irq(&ctx->lock); - /* - * If the event is still active, we need to retry the cross-call. - */ - if (event->state == PERF_EVENT_STATE_ACTIVE) { + if (event->state <= PERF_EVENT_STATE_OFF) { raw_spin_unlock_irq(&ctx->lock); - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; - goto retry; - } - - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; + return; } raw_spin_unlock_irq(&ctx->lock); + + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -1917,9 +1968,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -2038,13 +2086,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2057,10 +2119,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) +{ + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); +} + /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { @@ -2068,72 +2142,59 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(task_ctx); + bool activate = true; + int ret = 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; - } - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; + goto unlock; + } - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); + /* + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. + */ + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); - add_event_to_ctx(event, ctx); + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); + } - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } - perf_pmu_enable(cpuctx->ctx.pmu); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. + * Attach a performance event to a context. * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { - struct task_struct *task = ctx->task; + struct task_struct *task = READ_ONCE(ctx->task); lockdep_assert_held(&ctx->mutex); @@ -2142,39 +2203,45 @@ perf_install_in_context(struct perf_event_context *ctx, event->cpu = cpu; if (!task) { - /* - * Per cpu events are installed via an smp call and - * the install is always successful. - */ cpu_function_call(cpu, __perf_install_in_context, event); return; } -retry: - if (!task_function_call(task, __perf_install_in_context, event)) + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) return; - raw_spin_lock_irq(&ctx->lock); /* - * If we failed to find a running task, but find the context active now - * that we've acquired the ctx->lock, retry. + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. */ - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); +again: + /* + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. + */ + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). */ - task = ctx->task; - goto retry; + raw_spin_unlock_irq(&ctx->lock); + return; } - + raw_spin_unlock_irq(&ctx->lock); /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. */ - add_event_to_ctx(event, ctx); - raw_spin_unlock_irq(&ctx->lock); + goto again; } /* @@ -2201,80 +2268,47 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; - - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); + struct perf_event_context *task_ctx; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) + return; - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - goto unlock; + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } -unlock: - raw_spin_unlock(&ctx->lock); + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); - return 0; + ctx_resched(cpuctx, task_ctx); } /* @@ -2289,58 +2323,26 @@ unlock: static void _perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; - if (!task) { - /* - * Enable the event on the cpu that it's on - */ - cpu_function_call(event->cpu, __perf_event_enable, event); + raw_spin_lock_irq(&ctx->lock); + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { + raw_spin_unlock_irq(&ctx->lock); return; } - raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto out; - /* * If the event is in error state, clear that first. - * That way, if we see the event in error state below, we - * know that it has gone back into error state, as distinct - * from the task having been scheduled away before the - * cross-call arrived. + * + * That way, if we see the event in error state below, we know that it + * has gone back into error state, as distinct from the task having + * been scheduled away before the cross-call arrived. */ if (event->state == PERF_EVENT_STATE_ERROR) event->state = PERF_EVENT_STATE_OFF; - -retry: - if (!ctx->is_active) { - __perf_event_mark_enabled(event); - goto out; - } - raw_spin_unlock_irq(&ctx->lock); - if (!task_function_call(task, __perf_event_enable, event)) - return; - - raw_spin_lock_irq(&ctx->lock); - - /* - * If the context is active and the event is still off, - * we need to retry the cross-call. - */ - if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) { - /* - * task could have been flipped by a concurrent - * perf_event_context_sched_out() - */ - task = ctx->task; - goto retry; - } - -out: - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -2390,25 +2392,59 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; - ctx->is_active &= ~event_type; - if (likely(!ctx->nr_events)) + lockdep_assert_held(&ctx->lock); + + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). + */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); return; + } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + + /* + * Always update time if it was set; not only when it changes. + * Otherwise we can 'forget' to update time for any but the last + * context we sched out. For example: + * + * ctx_sched_out(.event_type = EVENT_FLEXIBLE) + * ctx_sched_out(.event_type = EVENT_PINNED) + * + * would only update time for the pinned events. + */ + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2566,17 +2602,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2589,8 +2629,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } @@ -2685,20 +2724,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; -} - /* * Called with IRQs disabled */ @@ -2773,25 +2798,40 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); - ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + ctx->is_active |= (event_type | EVENT_TIME); + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -2821,12 +2861,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -2848,6 +2883,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2855,13 +2900,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -3099,17 +3137,6 @@ done: return rotate; } -#ifdef CONFIG_NO_HZ_FULL -bool perf_event_can_stop_tick(void) -{ - if (atomic_read(&nr_freq_events) || - __this_cpu_read(perf_throttled_count)) - return false; - else - return true; -} -#endif - void perf_event_task_tick(void) { struct list_head *head = this_cpu_ptr(&active_ctx_list); @@ -3120,6 +3147,7 @@ void perf_event_task_tick(void) __this_cpu_inc(perf_throttled_seq); throttled = __this_cpu_xchg(perf_throttled_count, 0); + tick_dep_clear_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); list_for_each_entry_safe(ctx, tmp, head, active_ctx_list) perf_adjust_freq_unthr_context(ctx, throttled); @@ -3144,48 +3172,34 @@ static int event_enable_on_exec(struct perf_event *event, * Enable all of a task's events that have been marked enable-on-exec. * This expects task == current. */ -static void perf_event_enable_on_exec(struct perf_event_context *ctx) +static void perf_event_enable_on_exec(int ctxn) { - struct perf_event_context *clone_ctx = NULL; + struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); + ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -3195,17 +3209,11 @@ out: void perf_event_exec(void) { - struct perf_event_context *ctx; int ctxn; rcu_read_lock(); - for_each_task_context_nr(ctxn) { - ctx = current->perf_event_ctxp[ctxn]; - if (!ctx) - continue; - - perf_event_enable_on_exec(ctx); - } + for_each_task_context_nr(ctxn) + perf_event_enable_on_exec(ctxn); rcu_read_unlock(); } @@ -3387,7 +3395,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3429,7 +3436,7 @@ find_lively_task_by_vpid(pid_t vpid) /* Reuse ptrace permission checks for now. */ err = -EACCES; - if (!ptrace_may_access(task, PTRACE_MODE_READ)) + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) goto errout; return task; @@ -3572,13 +3579,37 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) atomic_dec(&per_cpu(perf_cgroup_events, cpu)); } +#ifdef CONFIG_NO_HZ_FULL +static DEFINE_SPINLOCK(nr_freq_lock); +#endif + +static void unaccount_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + spin_lock(&nr_freq_lock); + if (atomic_dec_and_test(&nr_freq_events)) + tick_nohz_dep_clear(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void unaccount_freq_event(void) +{ + if (tick_nohz_full_enabled()) + unaccount_freq_event_nohz(); + else + atomic_dec(&nr_freq_events); +} + static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3586,19 +3617,32 @@ static void unaccount_event(struct perf_event *event) if (event->attr.task) atomic_dec(&nr_task_events); if (event->attr.freq) - atomic_dec(&nr_freq_events); + unaccount_freq_event(); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; + + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3609,7 +3653,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. * * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3684,29 +3728,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3728,7 +3749,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -3755,14 +3794,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3790,8 +3828,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -3799,37 +3839,111 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *child, *tmp; + + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(ctx->parent_ctx); + perf_remove_from_context(event, DETACH_GROUP); + + raw_spin_lock_irq(&ctx->lock); /* - * There are two ways this annotation is useful: + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); - WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + perf_event_ctx_unlock(event, ctx); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ - put_event(event); + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3839,46 +3953,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. - */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, true); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -4022,7 +4100,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -4107,7 +4185,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -4139,20 +4217,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4163,6 +4235,14 @@ static int __perf_event_period(void *info) active = (event->state == PERF_EVENT_STATE_ACTIVE); if (active) { perf_pmu_disable(ctx->pmu); + /* + * We could be throttled; unthrottle now to avoid the tick + * trying to unthrottle while we already re-started the event. + */ + if (event->hw.interrupts == MAX_INTERRUPTS) { + event->hw.interrupts = 0; + perf_log_throttle(event, 1); + } event->pmu->stop(event, PERF_EF_UPDATE); } @@ -4172,16 +4252,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; - struct perf_event_context *ctx = event->ctx; - struct task_struct *task; u64 value; if (!is_sampling_event(event)) @@ -4196,27 +4270,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - task = ctx->task; - pe.value = value; - - if (!task) { - cpu_function_call(event->cpu, __perf_event_period, &pe); - return 0; - } - -retry: - if (!task_function_call(task, __perf_event_period, &pe)) - return 0; - - raw_spin_lock_irq(&ctx->lock); - if (ctx->is_active) { - raw_spin_unlock_irq(&ctx->lock); - task = ctx->task; - goto retry; - } - - __perf_event_period(&pe); - raw_spin_unlock_irq(&ctx->lock); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4928,9 +4982,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; @@ -4988,7 +5042,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -5666,6 +5720,17 @@ perf_event_aux_ctx(struct perf_event_context *ctx, } static void +perf_event_aux_task_ctx(perf_event_aux_output_cb output, void *data, + struct perf_event_context *task_ctx) +{ + rcu_read_lock(); + preempt_disable(); + perf_event_aux_ctx(task_ctx, output, data); + preempt_enable(); + rcu_read_unlock(); +} + +static void perf_event_aux(perf_event_aux_output_cb output, void *data, struct perf_event_context *task_ctx) { @@ -5674,14 +5739,23 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, struct pmu *pmu; int ctxn; + /* + * If we have task_ctx != NULL we only notify + * the task context itself. The task_ctx is set + * only for EXIT events before releasing task + * context. + */ + if (task_ctx) { + perf_event_aux_task_ctx(output, data, task_ctx); + return; + } + rcu_read_lock(); list_for_each_entry_rcu(pmu, &pmus, entry) { cpuctx = get_cpu_ptr(pmu->pmu_cpu_context); if (cpuctx->unique_pmu != pmu) goto next; perf_event_aux_ctx(&cpuctx->ctx, output, data); - if (task_ctx) - goto next; ctxn = pmu->task_ctx_nr; if (ctxn < 0) goto next; @@ -5691,12 +5765,6 @@ perf_event_aux(perf_event_aux_output_cb output, void *data, next: put_cpu_ptr(pmu->pmu_cpu_context); } - - if (task_ctx) { - preempt_disable(); - perf_event_aux_ctx(task_ctx, output, data); - preempt_enable(); - } rcu_read_unlock(); } @@ -6401,9 +6469,9 @@ static int __perf_event_overflow(struct perf_event *event, if (unlikely(throttle && hwc->interrupts >= max_samples_per_tick)) { __this_cpu_inc(perf_throttled_count); + tick_dep_set_cpu(smp_processor_id(), TICK_DEP_BIT_PERF_EVENTS); hwc->interrupts = MAX_INTERRUPTS; perf_log_throttle(event, 0); - tick_nohz_full_kick(); ret = 1; } } @@ -6462,9 +6530,6 @@ struct swevent_htable { /* Recursion avoidance in each contexts */ int recursion[PERF_NR_CONTEXTS]; - - /* Keeps track of cpu being initialized/exited */ - bool online; }; static DEFINE_PER_CPU(struct swevent_htable, swevent_htable); @@ -6722,14 +6787,8 @@ static int perf_swevent_add(struct perf_event *event, int flags) hwc->state = !(flags & PERF_EF_START); head = find_swevent_head(swhash, event); - if (!head) { - /* - * We can race with cpu hotplug code. Do not - * WARN if the cpu just got unplugged. - */ - WARN_ON_ONCE(swhash->online); + if (WARN_ON_ONCE(!head)) return -EINVAL; - } hlist_add_head_rcu(&event->hlist_entry, head); perf_event_update_userpage(event); @@ -6771,7 +6830,7 @@ static void swevent_hlist_release(struct swevent_htable *swhash) kfree_rcu(hlist, rcu_head); } -static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) +static void swevent_hlist_put_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); @@ -6783,21 +6842,20 @@ static void swevent_hlist_put_cpu(struct perf_event *event, int cpu) mutex_unlock(&swhash->hlist_mutex); } -static void swevent_hlist_put(struct perf_event *event) +static void swevent_hlist_put(void) { int cpu; for_each_possible_cpu(cpu) - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } -static int swevent_hlist_get_cpu(struct perf_event *event, int cpu) +static int swevent_hlist_get_cpu(int cpu) { struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); int err = 0; mutex_lock(&swhash->hlist_mutex); - if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) { struct swevent_hlist *hlist; @@ -6815,14 +6873,13 @@ exit: return err; } -static int swevent_hlist_get(struct perf_event *event) +static int swevent_hlist_get(void) { - int err; - int cpu, failed_cpu; + int err, cpu, failed_cpu; get_online_cpus(); for_each_possible_cpu(cpu) { - err = swevent_hlist_get_cpu(event, cpu); + err = swevent_hlist_get_cpu(cpu); if (err) { failed_cpu = cpu; goto fail; @@ -6835,7 +6892,7 @@ fail: for_each_possible_cpu(cpu) { if (cpu == failed_cpu) break; - swevent_hlist_put_cpu(event, cpu); + swevent_hlist_put_cpu(cpu); } put_online_cpus(); @@ -6851,7 +6908,7 @@ static void sw_perf_event_destroy(struct perf_event *event) WARN_ON(event->parent); static_key_slow_dec(&perf_swevent_enabled[event_id]); - swevent_hlist_put(event); + swevent_hlist_put(); } static int perf_swevent_init(struct perf_event *event) @@ -6882,7 +6939,7 @@ static int perf_swevent_init(struct perf_event *event) if (!event->parent) { int err; - err = swevent_hlist_get(event); + err = swevent_hlist_get(); if (err) return err; @@ -6913,6 +6970,10 @@ static int perf_tp_filter_match(struct perf_event *event, { void *record = data->raw->data; + /* only top level events have filters set */ + if (event->parent) + event = event->parent; + if (likely(!event->filter) || filter_match_preds(event->filter, record)) return 1; return 0; @@ -7799,31 +7860,75 @@ static void account_event_cpu(struct perf_event *event, int cpu) atomic_inc(&per_cpu(perf_cgroup_events, cpu)); } +/* Freq events need the tick to stay alive (see perf_event_task_tick). */ +static void account_freq_event_nohz(void) +{ +#ifdef CONFIG_NO_HZ_FULL + /* Lock so we don't race with concurrent unaccount */ + spin_lock(&nr_freq_lock); + if (atomic_inc_return(&nr_freq_events) == 1) + tick_nohz_dep_set(TICK_DEP_BIT_PERF_EVENTS); + spin_unlock(&nr_freq_lock); +#endif +} + +static void account_freq_event(void) +{ + if (tick_nohz_full_enabled()) + account_freq_event_nohz(); + else + atomic_inc(&nr_freq_events); +} + + static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) atomic_inc(&nr_comm_events); if (event->attr.task) atomic_inc(&nr_task_events); - if (event->attr.freq) { - if (atomic_inc_return(&nr_freq_events) == 1) - tick_nohz_full_kick_all(); - } + if (event->attr.freq) + account_freq_event(); if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; + + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -7959,6 +8064,9 @@ perf_event_alloc(struct perf_event_attr *attr, int cpu, } } + /* symmetric to unaccount_event() in _free_event() */ + account_event(event); + return event; err_per_task: @@ -8322,8 +8430,6 @@ SYSCALL_DEFINE5(perf_event_open, } } - account_event(event); - /* * Special case software events and allow them to be part of * any hardware group. @@ -8436,16 +8542,26 @@ SYSCALL_DEFINE5(perf_event_open, f_flags); if (IS_ERR(event_file)) { err = PTR_ERR(event_file); + event_file = NULL; goto err_context; } if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8470,11 +8586,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8527,6 +8643,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8536,8 +8654,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -8562,7 +8678,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8604,9 +8725,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; - - account_event(event); + event->owner = TASK_TOMBSTONE; ctx = find_get_context(event->pmu, task, event); if (IS_ERR(ctx)) { @@ -8616,12 +8735,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8630,6 +8751,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8654,7 +8779,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -8721,33 +8846,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8760,59 +8867,86 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (parent_event) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. + */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; + struct perf_event *child_event, *next; - if (likely(!child->perf_event_ctxp[ctxn])) { - perf_event_task(child, NULL, 0); + WARN_ON_ONCE(child != current); + + child_ctx = perf_pin_task_context(child, ctxn); + if (!child_ctx) return; - } - local_irq_save(flags); /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + mutex_lock(&child_ctx->mutex); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + raw_spin_lock_irq(&child_ctx->lock); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ + clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8824,20 +8958,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); @@ -8862,13 +8984,20 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); for_each_task_context_nr(ctxn) perf_event_exit_task_context(child, ctxn); + + /* + * The perf_event_exit_task_context calls perf_event_task + * with child's task_ctx, which generates EXIT events for + * child contexts and sets child->perf_event_ctxp[] to NULL. + * At this point we need to send EXIT events to cpu contexts. + */ + perf_event_task(child, NULL, 0); } static void perf_free_event(struct perf_event *event, @@ -8938,21 +9067,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) @@ -8995,8 +9123,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9044,8 +9180,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); @@ -9250,8 +9384,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - swhash->online = true; - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9264,13 +9397,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) @@ -9292,14 +9426,7 @@ static void perf_event_exit_cpu_context(int cpu) static void perf_event_exit_cpu(int cpu) { - struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); - perf_event_exit_cpu_context(cpu); - - mutex_lock(&swhash->hlist_mutex); - swhash->online = false; - swevent_hlist_release(swhash); - mutex_unlock(&swhash->hlist_mutex); } #else static inline void perf_event_exit_cpu(int cpu) { } @@ -9333,12 +9460,29 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: + /* + * This must be done before the CPU comes alive, because the + * moment we can run tasks we can encounter (software) events. + * + * Specifically, someone can have inherited events on kthreadd + * or a pre-existing worker thread that gets re-bound. + */ perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: + /* + * This must be done before the CPU dies because after that an + * active event might want to IPI the CPU and that'll not work + * so great for dead CPUs. + * + * XXX smp_call_function_single() return -ENXIO without a warn + * so we could possibly deal with this. + * + * This is safe against new events arriving because + * sys_perf_event_open() serializes against hotplug using + * get_online_cpus(). + */ perf_event_exit_cpu(cpu); break; default: @@ -9366,9 +9510,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. @@ -9388,6 +9529,7 @@ ssize_t perf_event_sysfs_show(struct device *dev, struct device_attribute *attr, return 0; } +EXPORT_SYMBOL_GPL(perf_event_sysfs_show); static int __init perf_event_sysfs_init(void) { @@ -9447,16 +9589,18 @@ static void perf_cgroup_css_free(struct cgroup_subsys_state *css) static int __perf_cgroup_move(void *info) { struct task_struct *task = info; + rcu_read_lock(); perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN); + rcu_read_unlock(); return 0; } -static void perf_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void perf_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) task_function_call(task, __perf_cgroup_move, task); } diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 2bbad9c1274c..4199b6d193f5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -182,8 +182,6 @@ DEFINE_OUTPUT_COPY(__output_copy_user, arch_perf_out_copy_user) /* Callchain handling */ extern struct perf_callchain_entry * perf_callchain(struct perf_event *event, struct pt_regs *regs); -extern int get_callchain_buffers(void); -extern void put_callchain_buffers(void); static inline int get_recursion_context(int *recursion) { diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index b5d1ea79c595..c61f0cbd308b 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -3,7 +3,7 @@ * * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de> * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar - * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com> * * For licensing details see kernel-base/COPYING @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) @@ -746,8 +746,10 @@ struct ring_buffer *rb_alloc(int nr_pages, long watermark, int cpu, int flags) rb->user_page = all_buf; rb->data_pages[0] = all_buf + PAGE_SIZE; - rb->page_order = ilog2(nr_pages); - rb->nr_pages = !!nr_pages; + if (nr_pages) { + rb->nr_pages = 1; + rb->page_order = ilog2(nr_pages); + } ring_buffer_init(rb, watermark, flags); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 4e5e9798aa0c..7edc95edfaee 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ * Authors: * Srikar Dronamraju * Jim Keniston - * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011-2012 Red Hat, Inc., Peter Zijlstra */ #include <linux/kernel.h> @@ -161,7 +161,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, const unsigned long mmun_end = addr + PAGE_SIZE; struct mem_cgroup *memcg; - err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg); + err = mem_cgroup_try_charge(kpage, vma->vm_mm, GFP_KERNEL, &memcg, + false); if (err) return err; @@ -175,12 +176,12 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, goto unlock; get_page(kpage); - page_add_new_anon_rmap(kpage, vma, addr); - mem_cgroup_commit_charge(kpage, memcg, false); + page_add_new_anon_rmap(kpage, vma, addr, false); + mem_cgroup_commit_charge(kpage, memcg, false, false); lru_cache_add_active_or_unevictable(kpage, vma); if (!PageAnon(page)) { - dec_mm_counter(mm, MM_FILEPAGES); + dec_mm_counter(mm, mm_counter_file(page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -188,7 +189,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, ptep_clear_flush_notify(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, mk_pte(kpage, vma->vm_page_prot)); - page_remove_rmap(page); + page_remove_rmap(page, false); if (!page_mapped(page)) try_to_free_swap(page); pte_unmap_unlock(ptep, ptl); @@ -199,7 +200,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, err = 0; unlock: - mem_cgroup_cancel_charge(kpage, memcg); + mem_cgroup_cancel_charge(kpage, memcg, false); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); unlock_page(page); return err; @@ -298,7 +299,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ - ret = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); + ret = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &old_page, &vma); if (ret <= 0) return ret; @@ -320,7 +321,7 @@ retry: copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE); ret = __replace_page(vma, vaddr, old_page, new_page); - page_cache_release(new_page); + put_page(new_page); put_old: put_page(old_page); @@ -538,14 +539,14 @@ static int __copy_insn(struct address_space *mapping, struct file *filp, * see uprobe_register(). */ if (mapping->a_ops->readpage) - page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp); + page = read_mapping_page(mapping, offset >> PAGE_SHIFT, filp); else - page = shmem_read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT); + page = shmem_read_mapping_page(mapping, offset >> PAGE_SHIFT); if (IS_ERR(page)) return PTR_ERR(page); copy_from_page(page, offset, insn, nbytes); - page_cache_release(page); + put_page(page); return 0; } @@ -1177,6 +1178,7 @@ static struct xol_area *__create_xol_area(unsigned long vaddr) goto free_area; area->xol_mapping.name = "[uprobes]"; + area->xol_mapping.fault = NULL; area->xol_mapping.pages = area->pages; area->pages[0] = alloc_page(GFP_HIGHUSER); if (!area->pages[0]) @@ -1699,7 +1701,13 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) if (likely(result == 0)) goto out; - result = get_user_pages(NULL, mm, vaddr, 1, 0, 1, &page, NULL); + /* + * The NULL 'tsk' here ensures that any faults that occur here + * will not be accounted to the task. 'mm' *is* current->mm, + * but we treat this as a 'remote' access since it is + * essentially a kernel access to the memory. + */ + result = get_user_pages_remote(NULL, mm, vaddr, 1, 0, 1, &page, NULL); if (result < 0) return result; diff --git a/kernel/exit.c b/kernel/exit.c index 07110c6020a0..fd90195667e1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -53,14 +53,13 @@ #include <linux/oom.h> #include <linux/writeback.h> #include <linux/shm.h> +#include <linux/kcov.h> #include <asm/uaccess.h> #include <asm/unistd.h> #include <asm/pgtable.h> #include <asm/mmu_context.h> -static void exit_mm(struct task_struct *tsk); - static void __unhash_process(struct task_struct *p, bool group_dead) { nr_threads--; @@ -436,7 +435,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) @@ -657,6 +656,7 @@ void do_exit(long code) TASKS_RCU(int tasks_rcu_i); profile_task_exit(tsk); + kcov_task_exit(tsk); WARN_ON(blk_needs_flush_plug(tsk)); @@ -1120,8 +1120,7 @@ static int wait_task_zombie(struct wait_opts *wo, struct task_struct *p) static int *task_stopped_code(struct task_struct *p, bool ptrace) { if (ptrace) { - if (task_is_stopped_or_traced(p) && - !(p->jobctl & JOBCTL_LISTENING)) + if (task_is_traced(p) && !(p->jobctl & JOBCTL_LISTENING)) return &p->exit_code; } else { if (p->signal->flags & SIGNAL_STOP_STOPPED) diff --git a/kernel/fork.c b/kernel/fork.c index f97f2c449f5c..d277e83ed3e0 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -75,6 +75,7 @@ #include <linux/aio.h> #include <linux/compiler.h> #include <linux/sysctl.h> +#include <linux/kcov.h> #include <asm/pgtable.h> #include <asm/pgalloc.h> @@ -164,12 +165,20 @@ static struct thread_info *alloc_thread_info_node(struct task_struct *tsk, struct page *page = alloc_kmem_pages_node(node, THREADINFO_GFP, THREAD_SIZE_ORDER); + if (page) + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + 1 << THREAD_SIZE_ORDER); + return page ? page_address(page) : NULL; } static inline void free_thread_info(struct thread_info *ti) { - free_kmem_pages((unsigned long)ti, THREAD_SIZE_ORDER); + struct page *page = virt_to_page(ti); + + memcg_kmem_update_page_stat(page, MEMCG_KERNEL_STACK, + -(1 << THREAD_SIZE_ORDER)); + __free_kmem_pages(page, THREAD_SIZE_ORDER); } # else static struct kmem_cache *thread_info_cache; @@ -300,9 +309,9 @@ void __init fork_init(void) #define ARCH_MIN_TASKALIGN L1_CACHE_BYTES #endif /* create a slab on which task_structs can be allocated */ - task_struct_cachep = - kmem_cache_create("task_struct", arch_task_struct_size, - ARCH_MIN_TASKALIGN, SLAB_PANIC | SLAB_NOTRACK, NULL); + task_struct_cachep = kmem_cache_create("task_struct", + arch_task_struct_size, ARCH_MIN_TASKALIGN, + SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, NULL); #endif /* do the arch specific task caches init */ @@ -380,9 +389,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) #endif tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; + tsk->wake_q.next = NULL; account_kernel_stack(ti, 1); + kcov_task_init(tsk); + return tsk; free_ti: @@ -413,7 +425,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); mm->total_vm = oldmm->total_vm; - mm->shared_vm = oldmm->shared_vm; + mm->data_vm = oldmm->data_vm; mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; @@ -432,8 +444,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) struct file *file; if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, mpnt->vm_file, - -vma_pages(mpnt)); + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); continue; } charge = 0; @@ -1249,7 +1260,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, { int retval; struct task_struct *p; - void *cgrp_ss_priv[CGROUP_CANFORK_COUNT] = {}; if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) return ERR_PTR(-EINVAL); @@ -1348,9 +1358,9 @@ static struct task_struct *copy_process(unsigned long clone_flags, prev_cputime_init(&p->prev_cputime); #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN - seqlock_init(&p->vtime_seqlock); + seqcount_init(&p->vtime_seqcount); p->vtime_snap = 0; - p->vtime_snap_whence = VTIME_SLEEPING; + p->vtime_snap_whence = VTIME_INACTIVE; #endif #if defined(SPLIT_RSS_COUNTING) @@ -1368,8 +1378,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - if (clone_flags & CLONE_THREAD) - threadgroup_change_begin(current); + threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1527,7 +1536,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, * between here and cgroup_post_fork() if an organisation operation is in * progress. */ - retval = cgroup_can_fork(p, cgrp_ss_priv); + retval = cgroup_can_fork(p); if (retval) goto bad_fork_free_pid; @@ -1609,9 +1618,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, write_unlock_irq(&tasklist_lock); proc_fork_connector(p); - cgroup_post_fork(p, cgrp_ss_priv); - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + cgroup_post_fork(p); + threadgroup_change_end(current); perf_event_fork(p); trace_task_newtask(p, clone_flags); @@ -1620,7 +1628,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, return p; bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, cgrp_ss_priv); + cgroup_cancel_fork(p); bad_fork_free_pid: if (pid != &init_struct_pid) free_pid(pid); @@ -1652,8 +1660,7 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - if (clone_flags & CLONE_THREAD) - threadgroup_change_end(current); + threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); @@ -1851,16 +1858,19 @@ void __init proc_caches_init(void) sighand_cachep = kmem_cache_create("sighand_cache", sizeof(struct sighand_struct), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_DESTROY_BY_RCU| - SLAB_NOTRACK, sighand_ctor); + SLAB_NOTRACK|SLAB_ACCOUNT, sighand_ctor); signal_cachep = kmem_cache_create("signal_cache", sizeof(struct signal_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); files_cachep = kmem_cache_create("files_cache", sizeof(struct files_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); fs_cachep = kmem_cache_create("fs_cache", sizeof(struct fs_struct), 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); /* * FIXME! The "sizeof(struct mm_struct)" currently includes the * whole struct cpumask for the OFFSTACK case. We could change @@ -1870,8 +1880,9 @@ void __init proc_caches_init(void) */ mm_cachep = kmem_cache_create("mm_struct", sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN, - SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK, NULL); - vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC); + SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_NOTRACK|SLAB_ACCOUNT, + NULL); + vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC|SLAB_ACCOUNT); mmap_init(); nsproxy_cache_init(); } @@ -1884,7 +1895,7 @@ static int check_unshare_flags(unsigned long unshare_flags) if (unshare_flags & ~(CLONE_THREAD|CLONE_FS|CLONE_NEWNS|CLONE_SIGHAND| CLONE_VM|CLONE_FILES|CLONE_SYSVSEM| CLONE_NEWUTS|CLONE_NEWIPC|CLONE_NEWNET| - CLONE_NEWUSER|CLONE_NEWPID)) + CLONE_NEWUSER|CLONE_NEWPID|CLONE_NEWCGROUP)) return -EINVAL; /* * Not implemented, but pretend it works if there is nothing diff --git a/kernel/futex.c b/kernel/futex.c index 684d7549825a..a5d2e74c89e0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -124,16 +124,16 @@ * futex_wait(futex, val); * * waiters++; (a) - * mb(); (A) <-- paired with -. - * | - * lock(hash_bucket(futex)); | - * | - * uval = *futex; | - * | *futex = newval; - * | sys_futex(WAKE, futex); - * | futex_wake(futex); - * | - * `-------> mb(); (B) + * smp_mb(); (A) <-- paired with -. + * | + * lock(hash_bucket(futex)); | + * | + * uval = *futex; | + * | *futex = newval; + * | sys_futex(WAKE, futex); + * | futex_wake(futex); + * | + * `--------> smp_mb(); (B) * if (uval == val) * queue(); * unlock(hash_bucket(futex)); @@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key) /* * Ensure futex_get_mm() implies a full barrier such that * get_futex_key() implies a full barrier. This is relied upon - * as full barrier (B), see the ordering comment above. + * as smp_mb(); (B), see the ordering comment above. */ smp_mb__after_atomic(); } @@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key) switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: - ihold(key->shared.inode); /* implies MB (B) */ + ihold(key->shared.inode); /* implies smp_mb(); (B) */ break; case FUT_OFF_MMSHARED: - futex_get_mm(key); /* implies MB (B) */ + futex_get_mm(key); /* implies smp_mb(); (B) */ break; default: /* @@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key) * mm, therefore the only purpose of calling get_futex_key_refs * is because we need the barrier for the lockless waiter check. */ - smp_mb(); /* explicit MB (B) */ + smp_mb(); /* explicit smp_mb(); (B) */ } } @@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; - struct page *page, *page_head; + struct page *page; + struct address_space *mapping; int err, ro = 0; /* @@ -496,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw) if (!fshared) { key->private.mm = mm; key->private.address = address; - get_futex_key_refs(key); /* implies MB (B) */ + get_futex_key_refs(key); /* implies smp_mb(); (B) */ return 0; } @@ -519,46 +520,22 @@ again: else err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - page_head = page; - if (unlikely(PageTail(page))) { - put_page(page); - /* serialize against __split_huge_page_splitting() */ - local_irq_disable(); - if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) { - page_head = compound_head(page); - /* - * page_head is valid pointer but we must pin - * it before taking the PG_lock and/or - * PG_compound_lock. The moment we re-enable - * irqs __split_huge_page_splitting() can - * return and the head page can be freed from - * under us. We can't take the PG_lock and/or - * PG_compound_lock on a page that could be - * freed from under us. - */ - if (page != page_head) { - get_page(page_head); - put_page(page); - } - local_irq_enable(); - } else { - local_irq_enable(); - goto again; - } - } -#else - page_head = compound_head(page); - if (page != page_head) { - get_page(page_head); - put_page(page); - } -#endif - - lock_page(page_head); + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + */ + page = compound_head(page); + mapping = READ_ONCE(page->mapping); /* - * If page_head->mapping is NULL, then it cannot be a PageAnon + * If page->mapping is NULL, then it cannot be a PageAnon * page; but it might be the ZERO_PAGE or in the gate area or * in a special mapping (all cases which we are happy to fail); * or it may have been a good file page when get_user_pages_fast @@ -570,25 +547,38 @@ again: * * The case we do have to guard against is when memory pressure made * shmem_writepage move it from filecache to swapcache beneath us: - * an unlikely race, but we do need to retry for page_head->mapping. + * an unlikely race, but we do need to retry for page->mapping. */ - if (!page_head->mapping) { - int shmem_swizzled = PageSwapCache(page_head); - unlock_page(page_head); - put_page(page_head); + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; + unlock_page(page); + put_page(page); + if (shmem_swizzled) goto again; + return -EFAULT; } /* * Private mappings are handled in a simple way. * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * * NOTE: When userspace waits on a MAP_SHARED mapping, even if * it's a read-only handle, it's expected that futexes attach to * the object not the particular process. */ - if (PageAnon(page_head)) { + if (PageAnon(page)) { /* * A RO anonymous page will never change and thus doesn't make * sense for futex operations. @@ -601,17 +591,75 @@ again: key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ key->private.mm = mm; key->private.address = address; + + get_futex_key_refs(key); /* implies smp_mb(); (B) */ + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* + * Take a reference unless it is about to be freed. Previously + * this reference was taken by ihold under the page lock + * pinning the inode in place so i_lock was unnecessary. The + * only way for this check to fail is if the inode was + * truncated in parallel so warn for now if this happens. + * + * We are not calling into get_futex_key_refs() in file-backed + * cases, therefore a successful atomic_inc return below will + * guarantee that get_futex_key() will still imply smp_mb(); (B). + */ + if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + /* Should be impossible but lets be paranoid for now */ + if (WARN_ON_ONCE(inode->i_mapping != mapping)) { + err = -EFAULT; + rcu_read_unlock(); + iput(inode); + + goto out; + } + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ - key->shared.inode = page_head->mapping->host; + key->shared.inode = inode; key->shared.pgoff = basepage_index(page); + rcu_read_unlock(); } - get_futex_key_refs(key); /* implies MB (B) */ - out: - unlock_page(page_head); - put_page(page_head); + put_page(page); return err; } @@ -639,7 +687,7 @@ static int fault_in_user_writeable(u32 __user *uaddr) down_read(&mm->mmap_sem); ret = fixup_user_fault(current, mm, (unsigned long)uaddr, - FAULT_FLAG_WRITE); + FAULT_FLAG_WRITE, NULL); up_read(&mm->mmap_sem); return ret < 0 ? ret : 0; @@ -725,9 +773,12 @@ static struct futex_pi_state * alloc_pi_state(void) } /* + * Drops a reference to the pi_state object and frees or caches it + * when the last reference is gone. + * * Must be called with the hb lock held. */ -static void free_pi_state(struct futex_pi_state *pi_state) +static void put_pi_state(struct futex_pi_state *pi_state) { if (!pi_state) return; @@ -1223,7 +1274,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1249,22 +1300,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -1706,31 +1757,35 @@ retry_private: * exist yet, look it up one more time to ensure we have a * reference to it. If the lock was taken, ret contains the * vpid of the top waiter task. + * If the lock was not taken, we have pi_state and an initial + * refcount on it. In case of an error we have nothing. */ if (ret > 0) { WARN_ON(pi_state); drop_count++; task_count++; /* - * If we acquired the lock, then the user - * space value of uaddr2 should be vpid. It - * cannot be changed by the top waiter as it - * is blocked on hb2 lock if it tries to do - * so. If something fiddled with it behind our - * back the pi state lookup might unearth - * it. So we rather use the known value than - * rereading and handing potential crap to - * lookup_pi_state. + * If we acquired the lock, then the user space value + * of uaddr2 should be vpid. It cannot be changed by + * the top waiter as it is blocked on hb2 lock if it + * tries to do so. If something fiddled with it behind + * our back the pi state lookup might unearth it. So + * we rather use the known value than rereading and + * handing potential crap to lookup_pi_state. + * + * If that call succeeds then we have pi_state and an + * initial refcount on it. */ ret = lookup_pi_state(ret, hb2, &key2, &pi_state); } switch (ret) { case 0: + /* We hold a reference on the pi state. */ break; + + /* If the above failed, then pi_state is NULL */ case -EFAULT: - free_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1746,8 +1801,6 @@ retry_private: * exit to complete. * - The user space value changed. */ - free_pi_state(pi_state); - pi_state = NULL; double_unlock_hb(hb1, hb2); hb_waiters_dec(hb2); put_futex_key(&key2); @@ -1801,30 +1854,58 @@ retry_private: * of requeue_pi if we couldn't acquire the lock atomically. */ if (requeue_pi) { - /* Prepare the waiter to take the rt_mutex. */ + /* + * Prepare the waiter to take the rt_mutex. Take a + * refcount on the pi_state and store the pointer in + * the futex_q object of the waiter. + */ atomic_inc(&pi_state->refcount); this->pi_state = pi_state; ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, this->rt_waiter, this->task); if (ret == 1) { - /* We got the lock. */ + /* + * We got the lock. We do neither drop the + * refcount on pi_state nor clear + * this->pi_state because the waiter needs the + * pi_state for cleaning up the user space + * value. It will drop the refcount after + * doing so. + */ requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; } else if (ret) { - /* -EDEADLK */ + /* + * rt_mutex_start_proxy_lock() detected a + * potential deadlock when we tried to queue + * that waiter. Drop the pi_state reference + * which we took above and remove the pointer + * to the state from the waiters futex_q + * object. + */ this->pi_state = NULL; - free_pi_state(pi_state); - goto out_unlock; + put_pi_state(pi_state); + /* + * We stop queueing more waiters and let user + * space deal with the mess. + */ + break; } } requeue_futex(this, hb1, hb2, &key2); drop_count++; } + /* + * We took an extra initial reference to the pi_state either + * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We + * need to drop it here again. + */ + put_pi_state(pi_state); + out_unlock: - free_pi_state(pi_state); double_unlock_hb(hb1, hb2); wake_up_q(&wake_q); hb_waiters_dec(hb2); @@ -1866,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) q->lock_ptr = &hb->lock; - spin_lock(&hb->lock); /* implies MB (A) */ + spin_lock(&hb->lock); /* implies smp_mb(); (A) */ return hb; } @@ -1929,8 +2010,12 @@ static int unqueue_me(struct futex_q *q) /* In the common case we don't take the spinlock, which is nice. */ retry: - lock_ptr = q->lock_ptr; - barrier(); + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); if (lock_ptr != NULL) { spin_lock(lock_ptr); /* @@ -1973,7 +2058,7 @@ static void unqueue_me_pi(struct futex_q *q) __unqueue_futex(q); BUG_ON(!q->pi_state); - free_pi_state(q->pi_state); + put_pi_state(q->pi_state); q->pi_state = NULL; spin_unlock(q->lock_ptr); @@ -2129,11 +2214,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } @@ -2755,6 +2840,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, if (q.pi_state && (q.pi_state->owner != current)) { spin_lock(q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); + /* + * Drop the reference to the pi state which + * the requeue_pi() code acquired for us. + */ + put_pi_state(q.pi_state); spin_unlock(q.lock_ptr); } } else { @@ -2881,7 +2971,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->robust_list; @@ -3046,7 +3136,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (op & FUTEX_CLOCK_REALTIME) { flags |= FLAGS_CLOCKRT; - if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) + if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET && \ + cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; } diff --git a/kernel/futex_compat.c b/kernel/futex_compat.c index 55c8c9349cfe..4ae3232e7a28 100644 --- a/kernel/futex_compat.c +++ b/kernel/futex_compat.c @@ -155,7 +155,7 @@ COMPAT_SYSCALL_DEFINE3(get_robust_list, int, pid, } ret = -EPERM; - if (!ptrace_may_access(p, PTRACE_MODE_READ)) + if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS)) goto err_unlock; head = p->compat_robust_list; diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb6c1..2f9df37940a0 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -123,11 +123,6 @@ void gcov_enable_events(void) } #ifdef CONFIG_MODULES -static inline int within(void *addr, void *start, unsigned long size) -{ - return ((addr >= start) && (addr < start + size)); -} - /* Update list and generate events when modules are unloaded. */ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -142,7 +137,7 @@ static int gcov_module_notifier(struct notifier_block *nb, unsigned long event, /* Remove entries located in module from linked list. */ while ((info = gcov_info_next(info))) { - if (within(info, mod->module_core, mod->core_size)) { + if (within_module((unsigned long)info, mod)) { gcov_info_unlink(prev, info); if (gcov_events_enabled) gcov_event(GCOV_REMOVE, info); diff --git a/kernel/hung_task.c b/kernel/hung_task.c index e0f90c2b57aa..d234022805dc 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -185,10 +185,12 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); } -static unsigned long timeout_jiffies(unsigned long timeout) +static long hung_timeout_jiffies(unsigned long last_checked, + unsigned long timeout) { /* timeout of 0 will disable the watchdog */ - return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT; + return timeout ? last_checked - jiffies + timeout * HZ : + MAX_SCHEDULE_TIMEOUT; } /* @@ -224,18 +226,21 @@ EXPORT_SYMBOL_GPL(reset_hung_task_detector); */ static int watchdog(void *dummy) { + unsigned long hung_last_checked = jiffies; + set_user_nice(current, 0); for ( ; ; ) { unsigned long timeout = sysctl_hung_task_timeout_secs; + long t = hung_timeout_jiffies(hung_last_checked, timeout); - while (schedule_timeout_interruptible(timeout_jiffies(timeout))) - timeout = sysctl_hung_task_timeout_secs; - - if (atomic_xchg(&reset_hung_task, 0)) + if (t <= 0) { + if (!atomic_xchg(&reset_hung_task, 0)) + check_hung_uninterruptible_tasks(timeout); + hung_last_checked = jiffies; continue; - - check_hung_uninterruptible_tasks(timeout); + } + schedule_timeout_interruptible(t); } return 0; diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig index 3b48dab80164..3bbfd6a9c475 100644 --- a/kernel/irq/Kconfig +++ b/kernel/irq/Kconfig @@ -64,6 +64,10 @@ config IRQ_DOMAIN_HIERARCHY bool select IRQ_DOMAIN +# Generic IRQ IPI support +config GENERIC_IRQ_IPI + bool + # Generic MSI interrupt support config GENERIC_MSI_IRQ bool diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile index 2fc9cbdf35b6..2ee42e95a3ce 100644 --- a/kernel/irq/Makefile +++ b/kernel/irq/Makefile @@ -8,3 +8,4 @@ obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o obj-$(CONFIG_GENERIC_IRQ_MIGRATION) += cpuhotplug.o obj-$(CONFIG_PM_SLEEP) += pm.o obj-$(CONFIG_GENERIC_MSI_IRQ) += msi.o +obj-$(CONFIG_GENERIC_IRQ_IPI) += ipi.o diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c index 15206453b12a..2f9f2b0e79f2 100644 --- a/kernel/irq/chip.c +++ b/kernel/irq/chip.c @@ -338,7 +338,6 @@ void handle_nested_irq(unsigned int irq) raw_spin_lock_irq(&desc->lock); desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); action = desc->action; if (unlikely(!action || irqd_irq_disabled(&desc->irq_data))) { @@ -346,6 +345,7 @@ void handle_nested_irq(unsigned int irq) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); irqd_set(&desc->irq_data, IRQD_IRQ_INPROGRESS); raw_spin_unlock_irq(&desc->lock); @@ -412,13 +412,13 @@ void handle_simple_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); if (unlikely(!desc->action || irqd_irq_disabled(&desc->irq_data))) { desc->istate |= IRQS_PENDING; goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); out_unlock: @@ -462,7 +462,6 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -473,6 +472,7 @@ void handle_level_irq(struct irq_desc *desc) goto out_unlock; } + kstat_incr_irqs_this_cpu(desc); handle_irq_event(desc); cond_unmask_irq(desc); @@ -532,7 +532,6 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; desc->istate &= ~(IRQS_REPLAY | IRQS_WAITING); - kstat_incr_irqs_this_cpu(desc); /* * If its disabled or no action available @@ -544,6 +543,7 @@ void handle_fasteoi_irq(struct irq_desc *desc) goto out; } + kstat_incr_irqs_this_cpu(desc); if (desc->istate & IRQS_ONESHOT) mask_irq(desc); @@ -950,6 +950,7 @@ void irq_chip_ack_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_ack(data); } +EXPORT_SYMBOL_GPL(irq_chip_ack_parent); /** * irq_chip_mask_parent - Mask the parent interrupt @@ -960,6 +961,7 @@ void irq_chip_mask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_mask(data); } +EXPORT_SYMBOL_GPL(irq_chip_mask_parent); /** * irq_chip_unmask_parent - Unmask the parent interrupt @@ -970,6 +972,7 @@ void irq_chip_unmask_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_unmask(data); } +EXPORT_SYMBOL_GPL(irq_chip_unmask_parent); /** * irq_chip_eoi_parent - Invoke EOI on the parent interrupt @@ -980,6 +983,7 @@ void irq_chip_eoi_parent(struct irq_data *data) data = data->parent_data; data->chip->irq_eoi(data); } +EXPORT_SYMBOL_GPL(irq_chip_eoi_parent); /** * irq_chip_set_affinity_parent - Set affinity on the parent interrupt @@ -1015,6 +1019,7 @@ int irq_chip_set_type_parent(struct irq_data *data, unsigned int type) return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_chip_set_type_parent); /** * irq_chip_retrigger_hierarchy - Retrigger an interrupt in hardware diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a302cf9a2126..a15b5485b446 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -136,9 +136,9 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval = IRQ_NONE; unsigned int flags = 0, irq = desc->irq_data.irq; - struct irqaction *action = desc->action; + struct irqaction *action; - do { + for_each_action_of_desc(desc, action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -172,8 +172,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) } retval |= res; - action = action->next; - } while (action); + } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h index 05c2188271b8..09be2c903c6d 100644 --- a/kernel/irq/internals.h +++ b/kernel/irq/internals.h @@ -131,6 +131,9 @@ static inline void chip_bus_sync_unlock(struct irq_desc *desc) #define IRQ_GET_DESC_CHECK_GLOBAL (_IRQ_DESC_CHECK) #define IRQ_GET_DESC_CHECK_PERCPU (_IRQ_DESC_CHECK | _IRQ_DESC_PERCPU) +#define for_each_action_of_desc(desc, act) \ + for (act = desc->act; act; act = act->next) + struct irq_desc * __irq_get_desc_lock(unsigned int irq, unsigned long *flags, bool bus, unsigned int check); @@ -160,6 +163,8 @@ irq_put_desc_unlock(struct irq_desc *desc, unsigned long flags) __irq_put_desc_unlock(desc, flags, false); } +#define __irqd_to_state(d) ACCESS_PRIVATE((d)->common, state_use_accessors) + /* * Manipulation functions for irq_data.state */ @@ -188,6 +193,8 @@ static inline bool irqd_has_set(struct irq_data *d, unsigned int mask) return __irqd_to_state(d) & mask; } +#undef __irqd_to_state + static inline void kstat_incr_irqs_this_cpu(struct irq_desc *desc) { __this_cpu_inc(*desc->kstat_irqs); @@ -199,6 +206,11 @@ static inline int irq_desc_get_node(struct irq_desc *desc) return irq_common_data_get_node(&desc->irq_common_data); } +static inline int irq_desc_is_chained(struct irq_desc *desc) +{ + return (desc->action && desc->action == &chained_action); +} + #ifdef CONFIG_PM_SLEEP bool irq_pm_check_wakeup(struct irq_desc *desc); void irq_pm_install_action(struct irq_desc *desc, struct irqaction *action); diff --git a/kernel/irq/ipi.c b/kernel/irq/ipi.c new file mode 100644 index 000000000000..c37f34b00a11 --- /dev/null +++ b/kernel/irq/ipi.c @@ -0,0 +1,326 @@ +/* + * linux/kernel/irq/ipi.c + * + * Copyright (C) 2015 Imagination Technologies Ltd + * Author: Qais Yousef <qais.yousef@imgtec.com> + * + * This file contains driver APIs to the IPI subsystem. + */ + +#define pr_fmt(fmt) "genirq/ipi: " fmt + +#include <linux/irqdomain.h> +#include <linux/irq.h> + +/** + * irq_reserve_ipi() - Setup an IPI to destination cpumask + * @domain: IPI domain + * @dest: cpumask of cpus which can receive the IPI + * + * Allocate a virq that can be used to send IPI to any CPU in dest mask. + * + * On success it'll return linux irq number and 0 on failure + */ +unsigned int irq_reserve_ipi(struct irq_domain *domain, + const struct cpumask *dest) +{ + unsigned int nr_irqs, offset; + struct irq_data *data; + int virq, i; + + if (!domain ||!irq_domain_is_ipi(domain)) { + pr_warn("Reservation on a non IPI domain\n"); + return 0; + } + + if (!cpumask_subset(dest, cpu_possible_mask)) { + pr_warn("Reservation is not in possible_cpu_mask\n"); + return 0; + } + + nr_irqs = cpumask_weight(dest); + if (!nr_irqs) { + pr_warn("Reservation for empty destination mask\n"); + return 0; + } + + if (irq_domain_is_ipi_single(domain)) { + /* + * If the underlying implementation uses a single HW irq on + * all cpus then we only need a single Linux irq number for + * it. We have no restrictions vs. the destination mask. The + * underlying implementation can deal with holes nicely. + */ + nr_irqs = 1; + offset = 0; + } else { + unsigned int next; + + /* + * The IPI requires a seperate HW irq on each CPU. We require + * that the destination mask is consecutive. If an + * implementation needs to support holes, it can reserve + * several IPI ranges. + */ + offset = cpumask_first(dest); + /* + * Find a hole and if found look for another set bit after the + * hole. For now we don't support this scenario. + */ + next = cpumask_next_zero(offset, dest); + if (next < nr_cpu_ids) + next = cpumask_next(next, dest); + if (next < nr_cpu_ids) { + pr_warn("Destination mask has holes\n"); + return 0; + } + } + + virq = irq_domain_alloc_descs(-1, nr_irqs, 0, NUMA_NO_NODE); + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc descs\n"); + return 0; + } + + virq = __irq_domain_alloc_irqs(domain, virq, nr_irqs, NUMA_NO_NODE, + (void *) dest, true); + + if (virq <= 0) { + pr_warn("Can't reserve IPI, failed to alloc hw irqs\n"); + goto free_descs; + } + + for (i = 0; i < nr_irqs; i++) { + data = irq_get_irq_data(virq + i); + cpumask_copy(data->common->affinity, dest); + data->common->ipi_offset = offset; + } + return virq; + +free_descs: + irq_free_descs(virq, nr_irqs); + return 0; +} + +/** + * irq_destroy_ipi() - unreserve an IPI that was previously allocated + * @irq: linux irq number to be destroyed + * + * Return the IPIs allocated with irq_reserve_ipi() to the system destroying + * all virqs associated with them. + */ +void irq_destroy_ipi(unsigned int irq) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + struct irq_domain *domain; + unsigned int nr_irqs; + + if (!irq || !data || !ipimask) + return; + + domain = data->domain; + if (WARN_ON(domain == NULL)) + return; + + if (!irq_domain_is_ipi(domain)) { + pr_warn("Trying to destroy a non IPI domain!\n"); + return; + } + + if (irq_domain_is_ipi_per_cpu(domain)) + nr_irqs = cpumask_weight(ipimask); + else + nr_irqs = 1; + + irq_domain_free_irqs(irq, nr_irqs); +} + +/** + * ipi_get_hwirq - Get the hwirq associated with an IPI to a cpu + * @irq: linux irq number + * @cpu: the target cpu + * + * When dealing with coprocessors IPI, we need to inform the coprocessor of + * the hwirq it needs to use to receive and send IPIs. + * + * Returns hwirq value on success and INVALID_HWIRQ on failure. + */ +irq_hw_number_t ipi_get_hwirq(unsigned int irq, unsigned int cpu) +{ + struct irq_data *data = irq_get_irq_data(irq); + struct cpumask *ipimask = data ? irq_data_get_affinity_mask(data) : NULL; + + if (!data || !ipimask || cpu > nr_cpu_ids) + return INVALID_HWIRQ; + + if (!cpumask_test_cpu(cpu, ipimask)) + return INVALID_HWIRQ; + + /* + * Get the real hardware irq number if the underlying implementation + * uses a seperate irq per cpu. If the underlying implementation uses + * a single hardware irq for all cpus then the IPI send mechanism + * needs to take care of the cpu destinations. + */ + if (irq_domain_is_ipi_per_cpu(data->domain)) + data = irq_get_irq_data(irq + cpu - data->common->ipi_offset); + + return data ? irqd_to_hwirq(data) : INVALID_HWIRQ; +} +EXPORT_SYMBOL_GPL(ipi_get_hwirq); + +static int ipi_send_verify(struct irq_chip *chip, struct irq_data *data, + const struct cpumask *dest, unsigned int cpu) +{ + struct cpumask *ipimask = irq_data_get_affinity_mask(data); + + if (!chip || !ipimask) + return -EINVAL; + + if (!chip->ipi_send_single && !chip->ipi_send_mask) + return -EINVAL; + + if (cpu > nr_cpu_ids) + return -EINVAL; + + if (dest) { + if (!cpumask_subset(dest, ipimask)) + return -EINVAL; + } else { + if (!cpumask_test_cpu(cpu, ipimask)) + return -EINVAL; + } + return 0; +} + +/** + * __ipi_send_single - send an IPI to a target Linux SMP CPU + * @desc: pointer to irq_desc of the IRQ + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_single(struct irq_desc *desc, unsigned int cpu) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; +#endif + if (!chip->ipi_send_single) { + chip->ipi_send_mask(data, cpumask_of(cpu)); + return 0; + } + + /* FIXME: Store this information in irqdata flags */ + if (irq_domain_is_ipi_per_cpu(data->domain) && + cpu != data->common->ipi_offset) { + /* use the correct data for that cpu */ + unsigned irq = data->irq + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + } + chip->ipi_send_single(data, cpu); + return 0; +} + +/** + * ipi_send_mask - send an IPI to target Linux SMP CPU(s) + * @desc: pointer to irq_desc of the IRQ + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * This function is for architecture or core code to speed up IPI sending. Not + * usable from driver code. + * + * Returns zero on success and negative error number on failure. + */ +int __ipi_send_mask(struct irq_desc *desc, const struct cpumask *dest) +{ + struct irq_data *data = irq_desc_get_irq_data(desc); + struct irq_chip *chip = irq_data_get_irq_chip(data); + unsigned int cpu; + +#ifdef DEBUG + /* + * Minimise the overhead by omitting the checks for Linux SMP IPIs. + * Since the callers should be arch or core code which is generally + * trusted, only check for errors when debugging. + */ + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; +#endif + if (chip->ipi_send_mask) { + chip->ipi_send_mask(data, dest); + return 0; + } + + if (irq_domain_is_ipi_per_cpu(data->domain)) { + unsigned int base = data->irq; + + for_each_cpu(cpu, dest) { + unsigned irq = base + cpu - data->common->ipi_offset; + + data = irq_get_irq_data(irq); + chip->ipi_send_single(data, cpu); + } + } else { + for_each_cpu(cpu, dest) + chip->ipi_send_single(data, cpu); + } + return 0; +} + +/** + * ipi_send_single - Send an IPI to a single CPU + * @virq: linux irq number from irq_reserve_ipi() + * @cpu: destination CPU, must in the destination mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_single(unsigned int virq, unsigned int cpu) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, NULL, cpu))) + return -EINVAL; + + return __ipi_send_single(desc, cpu); +} +EXPORT_SYMBOL_GPL(ipi_send_single); + +/** + * ipi_send_mask - Send an IPI to target CPU(s) + * @virq: linux irq number from irq_reserve_ipi() + * @dest: dest CPU(s), must be a subset of the mask passed to + * irq_reserve_ipi() + * + * Returns zero on success and negative error number on failure. + */ +int ipi_send_mask(unsigned int virq, const struct cpumask *dest) +{ + struct irq_desc *desc = irq_to_desc(virq); + struct irq_data *data = desc ? irq_desc_get_irq_data(desc) : NULL; + struct irq_chip *chip = data ? irq_data_get_irq_chip(data) : NULL; + + if (WARN_ON_ONCE(ipi_send_verify(chip, data, dest, 0))) + return -EINVAL; + + return __ipi_send_mask(desc, dest); +} +EXPORT_SYMBOL_GPL(ipi_send_mask); diff --git a/kernel/irq/irqdesc.c b/kernel/irq/irqdesc.c index 239e2ae2c947..0ccd028817d7 100644 --- a/kernel/irq/irqdesc.c +++ b/kernel/irq/irqdesc.c @@ -24,10 +24,27 @@ static struct lock_class_key irq_desc_lock_class; #if defined(CONFIG_SMP) +static int __init irq_affinity_setup(char *str) +{ + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); + cpulist_parse(str, irq_default_affinity); + /* + * Set at least the boot cpu. We don't want to end up with + * bugreports caused by random comandline masks + */ + cpumask_set_cpu(smp_processor_id(), irq_default_affinity); + return 1; +} +__setup("irqaffinity=", irq_affinity_setup); + static void __init init_irq_default_affinity(void) { - alloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); - cpumask_setall(irq_default_affinity); +#ifdef CONFIG_CPUMASK_OFFSTACK + if (!irq_default_affinity) + zalloc_cpumask_var(&irq_default_affinity, GFP_NOWAIT); +#endif + if (cpumask_empty(irq_default_affinity)) + cpumask_setall(irq_default_affinity); } #else static void __init init_irq_default_affinity(void) @@ -159,6 +176,7 @@ static struct irq_desc *alloc_desc(int irq, int node, struct module *owner) raw_spin_lock_init(&desc->lock); lockdep_set_class(&desc->lock, &irq_desc_lock_class); + init_rcu_head(&desc->rcu); desc_set_defaults(irq, desc, node, owner); @@ -171,6 +189,15 @@ err_desc: return NULL; } +static void delayed_free_desc(struct rcu_head *rhp) +{ + struct irq_desc *desc = container_of(rhp, struct irq_desc, rcu); + + free_masks(desc); + free_percpu(desc->kstat_irqs); + kfree(desc); +} + static void free_desc(unsigned int irq) { struct irq_desc *desc = irq_to_desc(irq); @@ -187,9 +214,12 @@ static void free_desc(unsigned int irq) delete_irq_desc(irq); mutex_unlock(&sparse_irq_lock); - free_masks(desc); - free_percpu(desc->kstat_irqs); - kfree(desc); + /* + * We free the descriptor, masks and stat fields via RCU. That + * allows demultiplex interrupts to do rcu based management of + * the child interrupts. + */ + call_rcu(&desc->rcu, delayed_free_desc); } static int alloc_descs(unsigned int start, unsigned int cnt, int node, diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 22aa9612ef7c..3a519a01118b 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -23,8 +23,6 @@ static DEFINE_MUTEX(irq_domain_mutex); static DEFINE_MUTEX(revmap_trees_mutex); static struct irq_domain *irq_default_domain; -static int irq_domain_alloc_descs(int virq, unsigned int nr_irqs, - irq_hw_number_t hwirq, int node); static void irq_domain_check_hierarchy(struct irq_domain *domain); struct irqchip_fwid { @@ -60,6 +58,7 @@ struct fwnode_handle *irq_domain_alloc_fwnode(void *data) fwid->fwnode.type = FWNODE_IRQCHIP; return &fwid->fwnode; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_fwnode); /** * irq_domain_free_fwnode - Free a non-OF-backed fwnode_handle @@ -70,13 +69,14 @@ void irq_domain_free_fwnode(struct fwnode_handle *fwnode) { struct irqchip_fwid *fwid; - if (WARN_ON(fwnode->type != FWNODE_IRQCHIP)) + if (WARN_ON(!is_fwnode_irqchip(fwnode))) return; fwid = container_of(fwnode, struct irqchip_fwid, fwnode); kfree(fwid->name); kfree(fwid); } +EXPORT_SYMBOL_GPL(irq_domain_free_fwnode); /** * __irq_domain_add() - Allocate a new irq_domain data structure @@ -573,10 +573,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", @@ -833,8 +838,8 @@ const struct irq_domain_ops irq_domain_simple_ops = { }; EXPORT_SYMBOL_GPL(irq_domain_simple_ops); -static int irq_domain_alloc_descs(int virq, unsigned int cnt, - irq_hw_number_t hwirq, int node) +int irq_domain_alloc_descs(int virq, unsigned int cnt, irq_hw_number_t hwirq, + int node) { unsigned int hint; @@ -888,6 +893,7 @@ struct irq_domain *irq_domain_create_hierarchy(struct irq_domain *parent, return domain; } +EXPORT_SYMBOL_GPL(irq_domain_create_hierarchy); static void irq_domain_insert_irq(int virq) { @@ -1013,6 +1019,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_hwirq_and_chip - Set hwirq and irqchip of @virq at @domain @@ -1037,6 +1044,7 @@ int irq_domain_set_hwirq_and_chip(struct irq_domain *domain, unsigned int virq, return 0; } +EXPORT_SYMBOL_GPL(irq_domain_set_hwirq_and_chip); /** * irq_domain_set_info - Set the complete data for a @virq in @domain @@ -1058,6 +1066,7 @@ void irq_domain_set_info(struct irq_domain *domain, unsigned int virq, __irq_set_handler(virq, handler, 0, handler_name); irq_set_handler_data(virq, handler_data); } +EXPORT_SYMBOL(irq_domain_set_info); /** * irq_domain_reset_irq_data - Clear hwirq, chip and chip_data in @irq_data @@ -1069,6 +1078,7 @@ void irq_domain_reset_irq_data(struct irq_data *irq_data) irq_data->chip = &no_irq_chip; irq_data->chip_data = NULL; } +EXPORT_SYMBOL_GPL(irq_domain_reset_irq_data); /** * irq_domain_free_irqs_common - Clear irq_data and free the parent @@ -1125,9 +1135,9 @@ static void irq_domain_free_irqs_recursive(struct irq_domain *domain, } } -static int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, - unsigned int irq_base, - unsigned int nr_irqs, void *arg) +int irq_domain_alloc_irqs_recursive(struct irq_domain *domain, + unsigned int irq_base, + unsigned int nr_irqs, void *arg) { int ret = 0; struct irq_domain *parent = domain->parent; @@ -1266,6 +1276,7 @@ int irq_domain_alloc_irqs_parent(struct irq_domain *domain, nr_irqs, arg); return -ENOSYS; } +EXPORT_SYMBOL_GPL(irq_domain_alloc_irqs_parent); /** * irq_domain_free_irqs_parent - Free interrupts from parent domain @@ -1283,6 +1294,7 @@ void irq_domain_free_irqs_parent(struct irq_domain *domain, irq_domain_free_irqs_recursive(domain->parent, irq_base, nr_irqs); } +EXPORT_SYMBOL_GPL(irq_domain_free_irqs_parent); /** * irq_domain_activate_irq - Call domain_ops->activate recursively to activate @@ -1343,6 +1355,7 @@ struct irq_data *irq_domain_get_irq_data(struct irq_domain *domain, return (irq_data && irq_data->domain == domain) ? irq_data : NULL; } +EXPORT_SYMBOL_GPL(irq_domain_get_irq_data); /** * irq_domain_set_info - Set the complete data for a @virq in @domain diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index 0eebaeef317b..cc1cc641d653 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -144,13 +144,11 @@ int irq_can_set_affinity(unsigned int irq) */ void irq_set_thread_affinity(struct irq_desc *desc) { - struct irqaction *action = desc->action; + struct irqaction *action; - while (action) { + for_each_action_of_desc(desc, action) if (action->thread) set_bit(IRQTF_AFFINITY, &action->thread_flags); - action = action->next; - } } #ifdef CONFIG_GENERIC_PENDING_IRQ @@ -994,7 +992,7 @@ void irq_wake_thread(unsigned int irq, void *dev_id) return; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action; action; action = action->next) { + for_each_action_of_desc(desc, action) { if (action->dev_id == dev_id) { if (action->thread) __irq_wake_thread(desc, action); @@ -1324,8 +1322,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new) if (nmsk != omsk) /* hope the handler works with current trigger mode */ - pr_warning("irq %d uses trigger mode %u; requested %u\n", - irq, nmsk, omsk); + pr_warn("irq %d uses trigger mode %u; requested %u\n", + irq, nmsk, omsk); } *old_ptr = new; @@ -1434,6 +1432,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!desc) return NULL; + chip_bus_lock(desc); raw_spin_lock_irqsave(&desc->lock, flags); /* @@ -1447,7 +1446,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) if (!action) { WARN(1, "Trying to free already-free IRQ %d\n", irq); raw_spin_unlock_irqrestore(&desc->lock, flags); - + chip_bus_sync_unlock(desc); return NULL; } @@ -1475,6 +1474,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id) #endif raw_spin_unlock_irqrestore(&desc->lock, flags); + chip_bus_sync_unlock(desc); unregister_handler_proc(irq, action); @@ -1553,9 +1553,7 @@ void free_irq(unsigned int irq, void *dev_id) desc->affinity_notify = NULL; #endif - chip_bus_lock(desc); kfree(__free_irq(irq, dev_id)); - chip_bus_sync_unlock(desc); } EXPORT_SYMBOL(free_irq); @@ -1609,6 +1607,9 @@ int request_threaded_irq(unsigned int irq, irq_handler_t handler, struct irq_desc *desc; int retval; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + /* * Sanity-check: shared interrupts must pass in a real dev-ID, * otherwise we'll have trouble later trying to figure out @@ -1699,9 +1700,13 @@ EXPORT_SYMBOL(request_threaded_irq); int request_any_context_irq(unsigned int irq, irq_handler_t handler, unsigned long flags, const char *name, void *dev_id) { - struct irq_desc *desc = irq_to_desc(irq); + struct irq_desc *desc; int ret; + if (irq == IRQ_NOTCONNECTED) + return -ENOTCONN; + + desc = irq_to_desc(irq); if (!desc) return -EINVAL; @@ -1743,6 +1748,31 @@ out: } EXPORT_SYMBOL_GPL(enable_percpu_irq); +/** + * irq_percpu_is_enabled - Check whether the per cpu irq is enabled + * @irq: Linux irq number to check for + * + * Must be called from a non migratable context. Returns the enable + * state of a per cpu interrupt on the current cpu. + */ +bool irq_percpu_is_enabled(unsigned int irq) +{ + unsigned int cpu = smp_processor_id(); + struct irq_desc *desc; + unsigned long flags; + bool is_enabled; + + desc = irq_get_desc_lock(irq, &flags, IRQ_GET_DESC_CHECK_PERCPU); + if (!desc) + return false; + + is_enabled = cpumask_test_cpu(cpu, desc->percpu_enabled); + irq_put_desc_unlock(desc, flags); + + return is_enabled; +} +EXPORT_SYMBOL_GPL(irq_percpu_is_enabled); + void disable_percpu_irq(unsigned int irq) { unsigned int cpu = smp_processor_id(); diff --git a/kernel/irq/msi.c b/kernel/irq/msi.c index 6b0c0b74a2a1..38e89ce7b071 100644 --- a/kernel/irq/msi.c +++ b/kernel/irq/msi.c @@ -109,9 +109,11 @@ static int msi_domain_alloc(struct irq_domain *domain, unsigned int virq, if (irq_find_mapping(domain, hwirq) > 0) return -EEXIST; - ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); - if (ret < 0) - return ret; + if (domain->parent) { + ret = irq_domain_alloc_irqs_parent(domain, virq, nr_irqs, arg); + if (ret < 0) + return ret; + } for (i = 0; i < nr_irqs; i++) { ret = ops->msi_init(domain, info, virq + i, hwirq + i, arg); @@ -252,6 +254,60 @@ struct irq_domain *msi_create_irq_domain(struct fwnode_handle *fwnode, &msi_domain_ops, info); } +int msi_domain_prepare_irqs(struct irq_domain *domain, struct device *dev, + int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + int ret; + + ret = ops->msi_check(domain, info, dev); + if (ret == 0) + ret = ops->msi_prepare(domain, dev, nvec, arg); + + return ret; +} + +int msi_domain_populate_irqs(struct irq_domain *domain, struct device *dev, + int virq, int nvec, msi_alloc_info_t *arg) +{ + struct msi_domain_info *info = domain->host_data; + struct msi_domain_ops *ops = info->ops; + struct msi_desc *desc; + int ret = 0; + + for_each_msi_entry(desc, dev) { + /* Don't even try the multi-MSI brain damage. */ + if (WARN_ON(!desc->irq || desc->nvec_used != 1)) { + ret = -EINVAL; + break; + } + + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + ops->set_desc(arg, desc); + /* Assumes the domain mutex is held! */ + ret = irq_domain_alloc_irqs_recursive(domain, virq, 1, arg); + if (ret) + break; + + irq_set_msi_desc_off(virq, 0, desc); + } + + if (ret) { + /* Mop up the damage */ + for_each_msi_entry(desc, dev) { + if (!(desc->irq >= virq && desc->irq < (virq + nvec))) + continue; + + irq_domain_free_irqs_common(domain, desc->irq, 1); + } + } + + return ret; +} + /** * msi_domain_alloc_irqs - Allocate interrupts from a MSI interrupt domain * @domain: The domain to allocate from @@ -270,9 +326,7 @@ int msi_domain_alloc_irqs(struct irq_domain *domain, struct device *dev, struct msi_desc *desc; int i, ret, virq = -1; - ret = ops->msi_check(domain, info, dev); - if (ret == 0) - ret = ops->msi_prepare(domain, dev, nvec, &arg); + ret = msi_domain_prepare_irqs(domain, dev, nvec, &arg); if (ret) return ret; diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c index e80c4400118a..cea1de0161f1 100644 --- a/kernel/irq/pm.c +++ b/kernel/irq/pm.c @@ -70,7 +70,8 @@ void irq_pm_remove_action(struct irq_desc *desc, struct irqaction *action) static bool suspend_device_irq(struct irq_desc *desc) { - if (!desc->action || desc->no_suspend_depth) + if (!desc->action || irq_desc_is_chained(desc) || + desc->no_suspend_depth) return false; if (irqd_is_wakeup_set(&desc->irq_data)) { diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c index a916cf144b65..4e1b94726818 100644 --- a/kernel/irq/proc.c +++ b/kernel/irq/proc.c @@ -291,7 +291,7 @@ static int name_unique(unsigned int irq, struct irqaction *new_action) int ret = 1; raw_spin_lock_irqsave(&desc->lock, flags); - for (action = desc->action ; action; action = action->next) { + for_each_action_of_desc(desc, action) { if ((action != new_action) && action->name && !strcmp(new_action->name, action->name)) { ret = 0; @@ -475,7 +475,7 @@ int show_interrupts(struct seq_file *p, void *v) for_each_online_cpu(j) any_count |= kstat_irqs_cpu(i, j); action = desc->action; - if ((!action || action == &chained_action) && !any_count) + if ((!action || irq_desc_is_chained(desc)) && !any_count) goto out; seq_printf(p, "%*d: ", prec, i); diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index 32144175458d..5707f97a3e6a 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -211,14 +211,12 @@ static void __report_bad_irq(struct irq_desc *desc, irqreturn_t action_ret) * desc->lock here. See synchronize_irq(). */ raw_spin_lock_irqsave(&desc->lock, flags); - action = desc->action; - while (action) { + for_each_action_of_desc(desc, action) { printk(KERN_ERR "[<%p>] %pf", action->handler, action->handler); if (action->thread_fn) printk(KERN_CONT " threaded [<%p>] %pf", action->thread_fn, action->thread_fn); printk(KERN_CONT "\n"); - action = action->next; } raw_spin_unlock_irqrestore(&desc->lock, flags); } diff --git a/kernel/irq_work.c b/kernel/irq_work.c index cbf9fb899d92..bcf107ce0854 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra * * Provides a framework for enqueueing and running callbacks from hardirq * context. The enqueueing is NMI-safe. diff --git a/kernel/jump_label.c b/kernel/jump_label.c index f7dd15d537f9..05254eeb4b4e 100644 --- a/kernel/jump_label.c +++ b/kernel/jump_label.c @@ -2,7 +2,7 @@ * jump label support * * Copyright (C) 2009 Jason Baron <jbaron@redhat.com> - * Copyright (C) 2011 Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2011 Peter Zijlstra * */ #include <linux/memory.h> diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c index 5c5987f10819..fafd1a3ef0da 100644 --- a/kernel/kallsyms.c +++ b/kernel/kallsyms.c @@ -38,6 +38,7 @@ * during the second link stage. */ extern const unsigned long kallsyms_addresses[] __weak; +extern const int kallsyms_offsets[] __weak; extern const u8 kallsyms_names[] __weak; /* @@ -47,6 +48,9 @@ extern const u8 kallsyms_names[] __weak; extern const unsigned long kallsyms_num_syms __attribute__((weak, section(".rodata"))); +extern const unsigned long kallsyms_relative_base +__attribute__((weak, section(".rodata"))); + extern const u8 kallsyms_token_table[] __weak; extern const u16 kallsyms_token_index[] __weak; @@ -176,6 +180,23 @@ static unsigned int get_symbol_offset(unsigned long pos) return name - kallsyms_names; } +static unsigned long kallsyms_sym_address(int idx) +{ + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + return kallsyms_addresses[idx]; + + /* values are unsigned offsets if --absolute-percpu is not in effect */ + if (!IS_ENABLED(CONFIG_KALLSYMS_ABSOLUTE_PERCPU)) + return kallsyms_relative_base + (u32)kallsyms_offsets[idx]; + + /* ...otherwise, positive offsets are absolute values */ + if (kallsyms_offsets[idx] >= 0) + return kallsyms_offsets[idx]; + + /* ...and negative offsets are relative to kallsyms_relative_base - 1 */ + return kallsyms_relative_base - 1 - kallsyms_offsets[idx]; +} + /* Lookup the address for this symbol. Returns 0 if not found. */ unsigned long kallsyms_lookup_name(const char *name) { @@ -187,7 +208,7 @@ unsigned long kallsyms_lookup_name(const char *name) off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); if (strcmp(namebuf, name) == 0) - return kallsyms_addresses[i]; + return kallsyms_sym_address(i); } return module_kallsyms_lookup_name(name); } @@ -204,7 +225,7 @@ int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *, for (i = 0, off = 0; i < kallsyms_num_syms; i++) { off = kallsyms_expand_symbol(off, namebuf, ARRAY_SIZE(namebuf)); - ret = fn(data, namebuf, NULL, kallsyms_addresses[i]); + ret = fn(data, namebuf, NULL, kallsyms_sym_address(i)); if (ret != 0) return ret; } @@ -220,7 +241,10 @@ static unsigned long get_symbol_pos(unsigned long addr, unsigned long i, low, high, mid; /* This kernel should never had been booted. */ - BUG_ON(!kallsyms_addresses); + if (!IS_ENABLED(CONFIG_KALLSYMS_BASE_RELATIVE)) + BUG_ON(!kallsyms_addresses); + else + BUG_ON(!kallsyms_offsets); /* Do a binary search on the sorted kallsyms_addresses array. */ low = 0; @@ -228,7 +252,7 @@ static unsigned long get_symbol_pos(unsigned long addr, while (high - low > 1) { mid = low + (high - low) / 2; - if (kallsyms_addresses[mid] <= addr) + if (kallsyms_sym_address(mid) <= addr) low = mid; else high = mid; @@ -238,15 +262,15 @@ static unsigned long get_symbol_pos(unsigned long addr, * Search for the first aliased symbol. Aliased * symbols are symbols with the same address. */ - while (low && kallsyms_addresses[low-1] == kallsyms_addresses[low]) + while (low && kallsyms_sym_address(low-1) == kallsyms_sym_address(low)) --low; - symbol_start = kallsyms_addresses[low]; + symbol_start = kallsyms_sym_address(low); /* Search for next non-aliased symbol. */ for (i = low + 1; i < kallsyms_num_syms; i++) { - if (kallsyms_addresses[i] > symbol_start) { - symbol_end = kallsyms_addresses[i]; + if (kallsyms_sym_address(i) > symbol_start) { + symbol_end = kallsyms_sym_address(i); break; } } @@ -470,7 +494,7 @@ static unsigned long get_ksymbol_core(struct kallsym_iter *iter) unsigned off = iter->nameoff; iter->module_name[0] = '\0'; - iter->value = kallsyms_addresses[iter->pos]; + iter->value = kallsyms_sym_address(iter->pos); iter->type = kallsyms_get_symbol_type(off); diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 0aa69ea1d8fd..3a47fa998fe0 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -122,8 +122,8 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, &task2->signal->cred_guard_mutex); if (ret) goto err; - if (!ptrace_may_access(task1, PTRACE_MODE_READ) || - !ptrace_may_access(task2, PTRACE_MODE_READ)) { + if (!ptrace_may_access(task1, PTRACE_MODE_READ_REALCREDS) || + !ptrace_may_access(task2, PTRACE_MODE_READ_REALCREDS)) { ret = -EPERM; goto err_unlock; } diff --git a/kernel/kcov.c b/kernel/kcov.c new file mode 100644 index 000000000000..3efbee0834a8 --- /dev/null +++ b/kernel/kcov.c @@ -0,0 +1,273 @@ +#define pr_fmt(fmt) "kcov: " fmt + +#include <linux/compiler.h> +#include <linux/types.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/mm.h> +#include <linux/printk.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/vmalloc.h> +#include <linux/debugfs.h> +#include <linux/uaccess.h> +#include <linux/kcov.h> + +/* + * kcov descriptor (one per opened debugfs file). + * State transitions of the descriptor: + * - initial state after open() + * - then there must be a single ioctl(KCOV_INIT_TRACE) call + * - then, mmap() call (several calls are allowed but not useful) + * - then, repeated enable/disable for a task (only one task a time allowed) + */ +struct kcov { + /* + * Reference counter. We keep one for: + * - opened file descriptor + * - task with enabled coverage (we can't unwire it from another task) + */ + atomic_t refcount; + /* The lock protects mode, size, area and t. */ + spinlock_t lock; + enum kcov_mode mode; + /* Size of arena (in long's for KCOV_MODE_TRACE). */ + unsigned size; + /* Coverage buffer shared with user space. */ + void *area; + /* Task for which we collect coverage, or NULL. */ + struct task_struct *t; +}; + +/* + * Entry point from instrumented code. + * This is called once per basic-block/edge. + */ +void __sanitizer_cov_trace_pc(void) +{ + struct task_struct *t; + enum kcov_mode mode; + + t = current; + /* + * We are interested in code coverage as a function of a syscall inputs, + * so we ignore code executed in interrupts. + */ + if (!t || in_interrupt()) + return; + mode = READ_ONCE(t->kcov_mode); + if (mode == KCOV_MODE_TRACE) { + unsigned long *area; + unsigned long pos; + + /* + * There is some code that runs in interrupts but for which + * in_interrupt() returns false (e.g. preempt_schedule_irq()). + * READ_ONCE()/barrier() effectively provides load-acquire wrt + * interrupts, there are paired barrier()/WRITE_ONCE() in + * kcov_ioctl_locked(). + */ + barrier(); + area = t->kcov_area; + /* The first word is number of subsequent PCs. */ + pos = READ_ONCE(area[0]) + 1; + if (likely(pos < t->kcov_size)) { + area[pos] = _RET_IP_; + WRITE_ONCE(area[0], pos); + } + } +} +EXPORT_SYMBOL(__sanitizer_cov_trace_pc); + +static void kcov_get(struct kcov *kcov) +{ + atomic_inc(&kcov->refcount); +} + +static void kcov_put(struct kcov *kcov) +{ + if (atomic_dec_and_test(&kcov->refcount)) { + vfree(kcov->area); + kfree(kcov); + } +} + +void kcov_task_init(struct task_struct *t) +{ + t->kcov_mode = KCOV_MODE_DISABLED; + t->kcov_size = 0; + t->kcov_area = NULL; + t->kcov = NULL; +} + +void kcov_task_exit(struct task_struct *t) +{ + struct kcov *kcov; + + kcov = t->kcov; + if (kcov == NULL) + return; + spin_lock(&kcov->lock); + if (WARN_ON(kcov->t != t)) { + spin_unlock(&kcov->lock); + return; + } + /* Just to not leave dangling references behind. */ + kcov_task_init(t); + kcov->t = NULL; + spin_unlock(&kcov->lock); + kcov_put(kcov); +} + +static int kcov_mmap(struct file *filep, struct vm_area_struct *vma) +{ + int res = 0; + void *area; + struct kcov *kcov = vma->vm_file->private_data; + unsigned long size, off; + struct page *page; + + area = vmalloc_user(vma->vm_end - vma->vm_start); + if (!area) + return -ENOMEM; + + spin_lock(&kcov->lock); + size = kcov->size * sizeof(unsigned long); + if (kcov->mode == KCOV_MODE_DISABLED || vma->vm_pgoff != 0 || + vma->vm_end - vma->vm_start != size) { + res = -EINVAL; + goto exit; + } + if (!kcov->area) { + kcov->area = area; + vma->vm_flags |= VM_DONTEXPAND; + spin_unlock(&kcov->lock); + for (off = 0; off < size; off += PAGE_SIZE) { + page = vmalloc_to_page(kcov->area + off); + if (vm_insert_page(vma, vma->vm_start + off, page)) + WARN_ONCE(1, "vm_insert_page() failed"); + } + return 0; + } +exit: + spin_unlock(&kcov->lock); + vfree(area); + return res; +} + +static int kcov_open(struct inode *inode, struct file *filep) +{ + struct kcov *kcov; + + kcov = kzalloc(sizeof(*kcov), GFP_KERNEL); + if (!kcov) + return -ENOMEM; + atomic_set(&kcov->refcount, 1); + spin_lock_init(&kcov->lock); + filep->private_data = kcov; + return nonseekable_open(inode, filep); +} + +static int kcov_close(struct inode *inode, struct file *filep) +{ + kcov_put(filep->private_data); + return 0; +} + +static int kcov_ioctl_locked(struct kcov *kcov, unsigned int cmd, + unsigned long arg) +{ + struct task_struct *t; + unsigned long size, unused; + + switch (cmd) { + case KCOV_INIT_TRACE: + /* + * Enable kcov in trace mode and setup buffer size. + * Must happen before anything else. + */ + if (kcov->mode != KCOV_MODE_DISABLED) + return -EBUSY; + /* + * Size must be at least 2 to hold current position and one PC. + * Later we allocate size * sizeof(unsigned long) memory, + * that must not overflow. + */ + size = arg; + if (size < 2 || size > INT_MAX / sizeof(unsigned long)) + return -EINVAL; + kcov->size = size; + kcov->mode = KCOV_MODE_TRACE; + return 0; + case KCOV_ENABLE: + /* + * Enable coverage for the current task. + * At this point user must have been enabled trace mode, + * and mmapped the file. Coverage collection is disabled only + * at task exit or voluntary by KCOV_DISABLE. After that it can + * be enabled for another task. + */ + unused = arg; + if (unused != 0 || kcov->mode == KCOV_MODE_DISABLED || + kcov->area == NULL) + return -EINVAL; + if (kcov->t != NULL) + return -EBUSY; + t = current; + /* Cache in task struct for performance. */ + t->kcov_size = kcov->size; + t->kcov_area = kcov->area; + /* See comment in __sanitizer_cov_trace_pc(). */ + barrier(); + WRITE_ONCE(t->kcov_mode, kcov->mode); + t->kcov = kcov; + kcov->t = t; + /* This is put either in kcov_task_exit() or in KCOV_DISABLE. */ + kcov_get(kcov); + return 0; + case KCOV_DISABLE: + /* Disable coverage for the current task. */ + unused = arg; + if (unused != 0 || current->kcov != kcov) + return -EINVAL; + t = current; + if (WARN_ON(kcov->t != t)) + return -EINVAL; + kcov_task_init(t); + kcov->t = NULL; + kcov_put(kcov); + return 0; + default: + return -ENOTTY; + } +} + +static long kcov_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) +{ + struct kcov *kcov; + int res; + + kcov = filep->private_data; + spin_lock(&kcov->lock); + res = kcov_ioctl_locked(kcov, cmd, arg); + spin_unlock(&kcov->lock); + return res; +} + +static const struct file_operations kcov_fops = { + .open = kcov_open, + .unlocked_ioctl = kcov_ioctl, + .mmap = kcov_mmap, + .release = kcov_close, +}; + +static int __init kcov_init(void) +{ + if (!debugfs_create_file("kcov", 0600, NULL, NULL, &kcov_fops)) { + pr_err("failed to create kcov in debugfs\n"); + return -ENOMEM; + } + return 0; +} + +device_initcall(kcov_init); diff --git a/kernel/kexec.c b/kernel/kexec.c index 4c5edc357923..ee70aef5cd81 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,6 +6,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -61,16 +63,16 @@ static int kimage_alloc_init(struct kimage **rimage, unsigned long entry, if (ret) goto out_free_image; - ret = sanity_check_segment_list(image); - if (ret) - goto out_free_image; - - /* Enable the special crash kernel control page allocation policy. */ if (kexec_on_panic) { + /* Enable special crash kernel control page alloc policy. */ image->control_page = crashk_res.start; image->type = KEXEC_TYPE_CRASH; } + ret = sanity_check_segment_list(image); + if (ret) + goto out_free_image; + /* * Find a location for the control code buffer, and add it * the vector of segments so that it's pages will also be diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index bd9f8a03cefa..8d34308ea449 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -6,7 +6,7 @@ * Version 2. See the file COPYING for more details. */ -#define pr_fmt(fmt) "kexec: " fmt +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/capability.h> #include <linux/mm.h> @@ -66,13 +66,15 @@ struct resource crashk_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; struct resource crashk_low_res = { .name = "Crash kernel", .start = 0, .end = 0, - .flags = IORESOURCE_BUSY | IORESOURCE_MEM + .flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM, + .desc = IORES_DESC_CRASH_KERNEL }; int kexec_should_crash(struct task_struct *p) @@ -310,12 +312,9 @@ static void kimage_free_pages(struct page *page) void kimage_free_page_list(struct list_head *list) { - struct list_head *pos, *next; + struct page *page, *next; - list_for_each_safe(pos, next, list) { - struct page *page; - - page = list_entry(pos, struct page, lru); + list_for_each_entry_safe(page, next, list, lru) { list_del(&page->lru); kimage_free_pages(page); } @@ -853,7 +852,12 @@ struct kimage *kexec_image; struct kimage *kexec_crash_image; int kexec_load_disabled; -void crash_kexec(struct pt_regs *regs) +/* + * No panic_cpu check version of crash_kexec(). This function is called + * only when panic_cpu holds the current CPU number; this is the only CPU + * which processes crash_kexec routines. + */ +void __crash_kexec(struct pt_regs *regs) { /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel @@ -876,6 +880,29 @@ void crash_kexec(struct pt_regs *regs) } } +void crash_kexec(struct pt_regs *regs) +{ + int old_cpu, this_cpu; + + /* + * Only one CPU is allowed to execute the crash_kexec() code as with + * panic(). Otherwise parallel calls of panic() and crash_kexec() + * may stop each other. To exclude them, we use panic_cpu here too. + */ + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + if (old_cpu == PANIC_CPU_INVALID) { + /* This is the 1st CPU which comes here, so go ahead. */ + __crash_kexec(regs); + + /* + * Reset panic_cpu to allow another panic()/crash_kexec() + * call. + */ + atomic_set(&panic_cpu, PANIC_CPU_INVALID); + } +} + size_t crash_get_memory_size(void) { size_t size = 0; @@ -934,7 +961,7 @@ int crash_shrink_memory(unsigned long new_size) ram_res->start = end; ram_res->end = crashk_res.end; - ram_res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; + ram_res->flags = IORESOURCE_BUSY | IORESOURCE_SYSTEM_RAM; ram_res->name = "System RAM"; crashk_res.end = end - 1; @@ -1027,7 +1054,7 @@ static int __init crash_notes_memory_init(void) crash_notes = __alloc_percpu(size, align); if (!crash_notes) { - pr_warn("Kexec: Memory allocation for saving cpu register states failed\n"); + pr_warn("Memory allocation for saving cpu register states failed\n"); return -ENOMEM; } return 0; diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 6a9a3f2a0e8e..c72d2ff5896e 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -9,6 +9,8 @@ * Version 2. See the file COPYING for more details. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/capability.h> #include <linux/mm.h> #include <linux/file.h> @@ -16,6 +18,7 @@ #include <linux/kexec.h> #include <linux/mutex.h> #include <linux/list.h> +#include <linux/fs.h> #include <crypto/hash.h> #include <crypto/sha.h> #include <linux/syscalls.h> @@ -31,65 +34,6 @@ size_t __weak kexec_purgatory_size = 0; static int kexec_calculate_store_digests(struct kimage *image); -static int copy_file_from_fd(int fd, void **buf, unsigned long *buf_len) -{ - struct fd f = fdget(fd); - int ret; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -EBADF; - - ret = vfs_getattr(&f.file->f_path, &stat); - if (ret) - goto out; - - if (stat.size > INT_MAX) { - ret = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - ret = -EINVAL; - goto out; - } - - *buf = vmalloc(stat.size); - if (!*buf) { - ret = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(*buf) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(*buf); - ret = bytes; - goto out; - } - - if (bytes == 0) - break; - pos += bytes; - } - - if (pos != stat.size) { - ret = -EBADF; - vfree(*buf); - goto out; - } - - *buf_len = pos; -out: - fdput(f); - return ret; -} - /* Architectures can provide this probe function */ int __weak arch_kexec_kernel_image_probe(struct kimage *image, void *buf, unsigned long buf_len) @@ -107,11 +51,13 @@ int __weak arch_kimage_file_post_load_cleanup(struct kimage *image) return -EINVAL; } +#ifdef CONFIG_KEXEC_VERIFY_SIG int __weak arch_kexec_kernel_verify_sig(struct kimage *image, void *buf, unsigned long buf_len) { return -EKEYREJECTED; } +#endif /* Apply relocations of type RELA */ int __weak @@ -178,16 +124,17 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, { int ret = 0; void *ldata; + loff_t size; - ret = copy_file_from_fd(kernel_fd, &image->kernel_buf, - &image->kernel_buf_len); + ret = kernel_read_file_from_fd(kernel_fd, &image->kernel_buf, + &size, INT_MAX, READING_KEXEC_IMAGE); if (ret) return ret; + image->kernel_buf_len = size; /* Call arch image probe handlers */ ret = arch_kexec_kernel_image_probe(image, image->kernel_buf, image->kernel_buf_len); - if (ret) goto out; @@ -202,10 +149,12 @@ kimage_file_prepare_segments(struct kimage *image, int kernel_fd, int initrd_fd, #endif /* It is possible that there no initramfs is being loaded */ if (!(flags & KEXEC_FILE_NO_INITRAMFS)) { - ret = copy_file_from_fd(initrd_fd, &image->initrd_buf, - &image->initrd_buf_len); + ret = kernel_read_file_from_fd(initrd_fd, &image->initrd_buf, + &size, INT_MAX, + READING_KEXEC_INITRAMFS); if (ret) goto out; + image->initrd_buf_len = size; } if (cmdline_len) { @@ -520,10 +469,10 @@ int kexec_add_buffer(struct kimage *image, char *buffer, unsigned long bufsz, /* Walk the RAM ranges and allocate a suitable range for the buffer */ if (image->type == KEXEC_TYPE_CRASH) - ret = walk_iomem_res("Crash kernel", - IORESOURCE_MEM | IORESOURCE_BUSY, - crashk_res.start, crashk_res.end, kbuf, - locate_mem_hole_callback); + ret = walk_iomem_res_desc(crashk_res.desc, + IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY, + crashk_res.start, crashk_res.end, kbuf, + locate_mem_hole_callback); else ret = walk_system_ram_res(0, -1, kbuf, locate_mem_hole_callback); diff --git a/kernel/kexec_internal.h b/kernel/kexec_internal.h index e4392a698ad4..0a52315d9c62 100644 --- a/kernel/kexec_internal.h +++ b/kernel/kexec_internal.h @@ -15,6 +15,27 @@ int kimage_is_destination_range(struct kimage *image, extern struct mutex kexec_mutex; #ifdef CONFIG_KEXEC_FILE +struct kexec_sha_region { + unsigned long start; + unsigned long len; +}; + +/* + * Keeps track of buffer parameters as provided by caller for requesting + * memory placement of buffer. + */ +struct kexec_buf { + struct kimage *image; + char *buffer; + unsigned long bufsz; + unsigned long mem; + unsigned long memsz; + unsigned long buf_align; + unsigned long buf_min; + unsigned long buf_max; + bool top_down; /* allocate from top of memory hole */ +}; + void kimage_file_post_load_cleanup(struct kimage *image); #else /* CONFIG_KEXEC_FILE */ static inline void kimage_file_post_load_cleanup(struct kimage *image) { } diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index e83b26464061..152da4a48867 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -20,7 +20,7 @@ #include <linux/capability.h> #include <linux/compiler.h> -#include <linux/rcupdate.h> /* rcu_expedited */ +#include <linux/rcupdate.h> /* rcu_expedited and rcu_normal */ #define KERNEL_ATTR_RO(_name) \ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) @@ -144,11 +144,12 @@ static ssize_t fscaps_show(struct kobject *kobj, } KERNEL_ATTR_RO(fscaps); +#ifndef CONFIG_TINY_RCU int rcu_expedited; static ssize_t rcu_expedited_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) { - return sprintf(buf, "%d\n", rcu_expedited); + return sprintf(buf, "%d\n", READ_ONCE(rcu_expedited)); } static ssize_t rcu_expedited_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -161,6 +162,24 @@ static ssize_t rcu_expedited_store(struct kobject *kobj, } KERNEL_ATTR_RW(rcu_expedited); +int rcu_normal; +static ssize_t rcu_normal_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", READ_ONCE(rcu_normal)); +} +static ssize_t rcu_normal_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + if (kstrtoint(buf, 0, &rcu_normal)) + return -EINVAL; + + return count; +} +KERNEL_ATTR_RW(rcu_normal); +#endif /* #ifndef CONFIG_TINY_RCU */ + /* * Make /sys/kernel/notes give the raw contents of our kernel .notes section. */ @@ -202,7 +221,10 @@ static struct attribute * kernel_attrs[] = { &kexec_crash_size_attr.attr, &vmcoreinfo_attr.attr, #endif +#ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, + &rcu_normal_attr.attr, +#endif NULL }; diff --git a/kernel/latencytop.c b/kernel/latencytop.c index a02812743a7e..b5c30d9f46c5 100644 --- a/kernel/latencytop.c +++ b/kernel/latencytop.c @@ -47,12 +47,12 @@ * of times) */ -#include <linux/latencytop.h> #include <linux/kallsyms.h> #include <linux/seq_file.h> #include <linux/notifier.h> #include <linux/spinlock.h> #include <linux/proc_fs.h> +#include <linux/latencytop.h> #include <linux/export.h> #include <linux/sched.h> #include <linux/list.h> @@ -289,4 +289,16 @@ static int __init init_lstats_procfs(void) proc_create("latency_stats", 0644, NULL, &lstats_fops); return 0; } + +int sysctl_latencytop(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int err; + + err = proc_dointvec(table, write, buffer, lenp, ppos); + if (latencytop_enabled) + force_schedstat_enabled(); + + return err; +} device_initcall(init_lstats_procfs); diff --git a/kernel/livepatch/core.c b/kernel/livepatch/core.c index 6e5344112419..d68fbf63b083 100644 --- a/kernel/livepatch/core.c +++ b/kernel/livepatch/core.c @@ -28,6 +28,7 @@ #include <linux/list.h> #include <linux/kallsyms.h> #include <linux/livepatch.h> +#include <asm/cacheflush.h> /** * struct klp_ops - structure for tracking registered ftrace ops structs @@ -98,12 +99,12 @@ static void klp_find_object_module(struct klp_object *obj) /* * We do not want to block removal of patched modules and therefore * we do not take a reference here. The patches are removed by - * a going module handler instead. + * klp_module_going() instead. */ mod = find_module(obj->name); /* - * Do not mess work of the module coming and going notifiers. - * Note that the patch might still be needed before the going handler + * Do not mess work of klp_module_coming() and klp_module_going(). + * Note that the patch might still be needed before klp_module_going() * is called. Module functions can be called even in the GOING state * until mod->exit() finishes. This is especially important for * patches that modify semantic of the functions. @@ -135,13 +136,8 @@ struct klp_find_arg { const char *objname; const char *name; unsigned long addr; - /* - * If count == 0, the symbol was not found. If count == 1, a unique - * match was found and addr is set. If count > 1, there is - * unresolvable ambiguity among "count" number of symbols with the same - * name in the same object. - */ unsigned long count; + unsigned long pos; }; static int klp_find_callback(void *data, const char *name, @@ -158,37 +154,48 @@ static int klp_find_callback(void *data, const char *name, if (args->objname && strcmp(args->objname, mod->name)) return 0; - /* - * args->addr might be overwritten if another match is found - * but klp_find_object_symbol() handles this and only returns the - * addr if count == 1. - */ args->addr = addr; args->count++; + /* + * Finish the search when the symbol is found for the desired position + * or the position is not defined for a non-unique symbol. + */ + if ((args->pos && (args->count == args->pos)) || + (!args->pos && (args->count > 1))) + return 1; + return 0; } static int klp_find_object_symbol(const char *objname, const char *name, - unsigned long *addr) + unsigned long sympos, unsigned long *addr) { struct klp_find_arg args = { .objname = objname, .name = name, .addr = 0, - .count = 0 + .count = 0, + .pos = sympos, }; mutex_lock(&module_mutex); kallsyms_on_each_symbol(klp_find_callback, &args); mutex_unlock(&module_mutex); - if (args.count == 0) + /* + * Ensure an address was found. If sympos is 0, ensure symbol is unique; + * otherwise ensure the symbol position count matches sympos. + */ + if (args.addr == 0) pr_err("symbol '%s' not found in symbol table\n", name); - else if (args.count > 1) - pr_err("unresolvable ambiguity (%lu matches) on symbol '%s' in object '%s'\n", - args.count, name, objname); - else { + else if (args.count > 1 && sympos == 0) { + pr_err("unresolvable ambiguity for symbol '%s' in object '%s'\n", + name, objname); + } else if (sympos != args.count && sympos > 0) { + pr_err("symbol position %lu for symbol '%s' in object '%s' not found\n", + sympos, name, objname ? objname : "vmlinux"); + } else { *addr = args.addr; return 0; } @@ -197,66 +204,6 @@ static int klp_find_object_symbol(const char *objname, const char *name, return -EINVAL; } -struct klp_verify_args { - const char *name; - const unsigned long addr; -}; - -static int klp_verify_callback(void *data, const char *name, - struct module *mod, unsigned long addr) -{ - struct klp_verify_args *args = data; - - if (!mod && - !strcmp(args->name, name) && - args->addr == addr) - return 1; - - return 0; -} - -static int klp_verify_vmlinux_symbol(const char *name, unsigned long addr) -{ - struct klp_verify_args args = { - .name = name, - .addr = addr, - }; - int ret; - - mutex_lock(&module_mutex); - ret = kallsyms_on_each_symbol(klp_verify_callback, &args); - mutex_unlock(&module_mutex); - - if (!ret) { - pr_err("symbol '%s' not found at specified address 0x%016lx, kernel mismatch?\n", - name, addr); - return -EINVAL; - } - - return 0; -} - -static int klp_find_verify_func_addr(struct klp_object *obj, - struct klp_func *func) -{ - int ret; - -#if defined(CONFIG_RANDOMIZE_BASE) - /* If KASLR has been enabled, adjust old_addr accordingly */ - if (kaslr_enabled() && func->old_addr) - func->old_addr += kaslr_offset(); -#endif - - if (!func->old_addr || klp_is_module(obj)) - ret = klp_find_object_symbol(obj->name, func->old_name, - &func->old_addr); - else - ret = klp_verify_vmlinux_symbol(func->old_name, - func->old_addr); - - return ret; -} - /* * external symbols are located outside the parent object (where the parent * object is either vmlinux or the kmod being patched). @@ -276,14 +223,18 @@ static int klp_find_external_symbol(struct module *pmod, const char *name, } preempt_enable(); - /* otherwise check if it's in another .o within the patch module */ - return klp_find_object_symbol(pmod->name, name, addr); + /* + * Check if it's in another .o within the patch module. This also + * checks that the external symbol is unique. + */ + return klp_find_object_symbol(pmod->name, name, 0, addr); } static int klp_write_object_relocations(struct module *pmod, struct klp_object *obj) { - int ret; + int ret = 0; + unsigned long val; struct klp_reloc *reloc; if (WARN_ON(!klp_is_object_loaded(obj))) @@ -292,35 +243,38 @@ static int klp_write_object_relocations(struct module *pmod, if (WARN_ON(!obj->relocs)) return -EINVAL; + module_disable_ro(pmod); + for (reloc = obj->relocs; reloc->name; reloc++) { - if (!klp_is_module(obj)) { - ret = klp_verify_vmlinux_symbol(reloc->name, - reloc->val); - if (ret) - return ret; - } else { - /* module, reloc->val needs to be discovered */ - if (reloc->external) - ret = klp_find_external_symbol(pmod, - reloc->name, - &reloc->val); - else - ret = klp_find_object_symbol(obj->mod->name, - reloc->name, - &reloc->val); - if (ret) - return ret; - } + /* discover the address of the referenced symbol */ + if (reloc->external) { + if (reloc->sympos > 0) { + pr_err("non-zero sympos for external reloc symbol '%s' is not supported\n", + reloc->name); + ret = -EINVAL; + goto out; + } + ret = klp_find_external_symbol(pmod, reloc->name, &val); + } else + ret = klp_find_object_symbol(obj->name, + reloc->name, + reloc->sympos, + &val); + if (ret) + goto out; + ret = klp_write_module_reloc(pmod, reloc->type, reloc->loc, - reloc->val + reloc->addend); + val + reloc->addend); if (ret) { pr_err("relocation failed for symbol '%s' at 0x%016lx (%d)\n", - reloc->name, reloc->val, ret); - return ret; + reloc->name, val, ret); + goto out; } } - return 0; +out: + module_enable_ro(pmod); + return ret; } static void notrace klp_ftrace_handler(unsigned long ip, @@ -587,7 +541,7 @@ EXPORT_SYMBOL_GPL(klp_enable_patch); * /sys/kernel/livepatch/<patch> * /sys/kernel/livepatch/<patch>/enabled * /sys/kernel/livepatch/<patch>/<object> - * /sys/kernel/livepatch/<patch>/<object>/<func> + * /sys/kernel/livepatch/<patch>/<object>/<function,sympos> */ static ssize_t enabled_store(struct kobject *kobj, struct kobj_attribute *attr, @@ -732,8 +686,14 @@ static int klp_init_func(struct klp_object *obj, struct klp_func *func) INIT_LIST_HEAD(&func->stack_node); func->state = KLP_DISABLED; + /* The format for the sysfs directory is <function,sympos> where sympos + * is the nth occurrence of this symbol in kallsyms for the patched + * object. If the user selects 0 for old_sympos, then 1 will be used + * since a unique symbol will be the first occurrence. + */ return kobject_init_and_add(&func->kobj, &klp_ktype_func, - &obj->kobj, "%s", func->old_name); + &obj->kobj, "%s,%lu", func->old_name, + func->old_sympos ? func->old_sympos : 1); } /* parts of the initialization that is done only when the object is loaded */ @@ -750,7 +710,9 @@ static int klp_init_object_loaded(struct klp_patch *patch, } klp_for_each_func(obj, func) { - ret = klp_find_verify_func_addr(obj, func); + ret = klp_find_object_symbol(obj->name, func->old_name, + func->old_sympos, + &func->old_addr); if (ret) return ret; } @@ -904,103 +866,108 @@ int klp_register_patch(struct klp_patch *patch) } EXPORT_SYMBOL_GPL(klp_register_patch); -static int klp_module_notify_coming(struct klp_patch *patch, - struct klp_object *obj) +int klp_module_coming(struct module *mod) { - struct module *pmod = patch->mod; - struct module *mod = obj->mod; int ret; + struct klp_patch *patch; + struct klp_object *obj; - ret = klp_init_object_loaded(patch, obj); - if (ret) { - pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; - } + if (WARN_ON(mod->state != MODULE_STATE_COMING)) + return -EINVAL; - if (patch->state == KLP_DISABLED) - return 0; + mutex_lock(&klp_mutex); + /* + * Each module has to know that klp_module_coming() + * has been called. We never know what module will + * get patched by a new patch. + */ + mod->klp_alive = true; - pr_notice("applying patch '%s' to loading module '%s'\n", - pmod->name, mod->name); + list_for_each_entry(patch, &klp_patches, list) { + klp_for_each_object(patch, obj) { + if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) + continue; - ret = klp_enable_object(obj); - if (ret) - pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", - pmod->name, mod->name, ret); - return ret; -} + obj->mod = mod; -static void klp_module_notify_going(struct klp_patch *patch, - struct klp_object *obj) -{ - struct module *pmod = patch->mod; - struct module *mod = obj->mod; + ret = klp_init_object_loaded(patch, obj); + if (ret) { + pr_warn("failed to initialize patch '%s' for module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } + + if (patch->state == KLP_DISABLED) + break; + + pr_notice("applying patch '%s' to loading module '%s'\n", + patch->mod->name, obj->mod->name); - if (patch->state == KLP_DISABLED) - goto disabled; + ret = klp_enable_object(obj); + if (ret) { + pr_warn("failed to apply patch '%s' to module '%s' (%d)\n", + patch->mod->name, obj->mod->name, ret); + goto err; + } - pr_notice("reverting patch '%s' on unloading module '%s'\n", - pmod->name, mod->name); + break; + } + } - klp_disable_object(obj); + mutex_unlock(&klp_mutex); -disabled: + return 0; + +err: + /* + * If a patch is unsuccessfully applied, return + * error to the module loader. + */ + pr_warn("patch '%s' failed for module '%s', refusing to load module '%s'\n", + patch->mod->name, obj->mod->name, obj->mod->name); + mod->klp_alive = false; klp_free_object_loaded(obj); + mutex_unlock(&klp_mutex); + + return ret; } -static int klp_module_notify(struct notifier_block *nb, unsigned long action, - void *data) +void klp_module_going(struct module *mod) { - int ret; - struct module *mod = data; struct klp_patch *patch; struct klp_object *obj; - if (action != MODULE_STATE_COMING && action != MODULE_STATE_GOING) - return 0; + if (WARN_ON(mod->state != MODULE_STATE_GOING && + mod->state != MODULE_STATE_COMING)) + return; mutex_lock(&klp_mutex); - /* - * Each module has to know that the notifier has been called. - * We never know what module will get patched by a new patch. + * Each module has to know that klp_module_going() + * has been called. We never know what module will + * get patched by a new patch. */ - if (action == MODULE_STATE_COMING) - mod->klp_alive = true; - else /* MODULE_STATE_GOING */ - mod->klp_alive = false; + mod->klp_alive = false; list_for_each_entry(patch, &klp_patches, list) { klp_for_each_object(patch, obj) { if (!klp_is_module(obj) || strcmp(obj->name, mod->name)) continue; - if (action == MODULE_STATE_COMING) { - obj->mod = mod; - ret = klp_module_notify_coming(patch, obj); - if (ret) { - obj->mod = NULL; - pr_warn("patch '%s' is in an inconsistent state!\n", - patch->mod->name); - } - } else /* MODULE_STATE_GOING */ - klp_module_notify_going(patch, obj); + if (patch->state != KLP_DISABLED) { + pr_notice("reverting patch '%s' on unloading module '%s'\n", + patch->mod->name, obj->mod->name); + klp_disable_object(obj); + } + klp_free_object_loaded(obj); break; } } mutex_unlock(&klp_mutex); - - return 0; } -static struct notifier_block klp_module_nb = { - .notifier_call = klp_module_notify, - .priority = INT_MIN+1, /* called late but before ftrace notifier */ -}; - static int __init klp_init(void) { int ret; @@ -1011,21 +978,11 @@ static int __init klp_init(void) return -EINVAL; } - ret = register_module_notifier(&klp_module_nb); - if (ret) - return ret; - klp_root_kobj = kobject_create_and_add("livepatch", kernel_kobj); - if (!klp_root_kobj) { - ret = -ENOMEM; - goto unregister; - } + if (!klp_root_kobj) + return -ENOMEM; return 0; - -unregister: - unregister_module_notifier(&klp_module_nb); - return ret; } module_init(klp_init); diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 8e96f6cc2a4a..31322a4275cd 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,3 +1,6 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 4e49cc4c9952..ed9410936a22 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * this code maps all the lock dependencies as they occur in a live kernel * and will warn about the following classes of locking bugs: @@ -123,8 +123,6 @@ static inline int debug_locks_off_graph_unlock(void) return ret; } -static int lockdep_initialized; - unsigned long nr_list_entries; static struct lock_list list_entries[MAX_LOCKDEP_ENTRIES]; @@ -150,8 +148,7 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock) } #ifdef CONFIG_LOCK_STAT -static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], - cpu_lock_stats); +static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], cpu_lock_stats); static inline u64 lockstat_clock(void) { @@ -292,7 +289,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +300,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -434,19 +431,6 @@ unsigned int max_lockdep_depth; #ifdef CONFIG_DEBUG_LOCKDEP /* - * We cannot printk in early bootup code. Not even early_printk() - * might work. So we mark any initialization errors and printk - * about it later on, in lockdep_info(). - */ -static int lockdep_init_error; -static const char *lock_init_error; -static unsigned long lockdep_init_trace_data[20]; -static struct stack_trace lockdep_init_trace = { - .max_entries = ARRAY_SIZE(lockdep_init_trace_data), - .entries = lockdep_init_trace_data, -}; - -/* * Various lockdep statistics: */ DEFINE_PER_CPU(struct lockdep_stats, lockdep_stats); @@ -666,23 +650,9 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; -#ifdef CONFIG_DEBUG_LOCKDEP - /* - * If the architecture calls into lockdep before initializing - * the hashes then we'll warn about it later. (we cannot printk - * right now) - */ - if (unlikely(!lockdep_initialized)) { - lockdep_init(); - lockdep_init_error = 1; - lock_init_error = lock->name; - save_stack_trace(&lockdep_init_trace); - } -#endif - if (unlikely(subclass >= MAX_LOCKDEP_SUBCLASSES)) { debug_locks_off(); printk(KERN_ERR @@ -719,7 +689,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +712,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +744,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +775,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1792,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1853,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1880,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. */ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1904,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1931,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1954,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2007,6 +1981,130 @@ struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) } /* + * Returns the index of the first held_lock of the current chain + */ +static inline int get_first_held_lock(struct task_struct *curr, + struct held_lock *hlock) +{ + int i; + struct held_lock *hlock_curr; + + for (i = curr->lockdep_depth - 1; i >= 0; i--) { + hlock_curr = curr->held_locks + i; + if (hlock_curr->irq_context != hlock->irq_context) + break; + + } + + return ++i; +} + +#ifdef CONFIG_DEBUG_LOCKDEP +/* + * Returns the next chain_key iteration + */ +static u64 print_chain_key_iteration(int class_idx, u64 chain_key) +{ + u64 new_chain_key = iterate_chain_key(chain_key, class_idx); + + printk(" class_idx:%d -> chain_key:%016Lx", + class_idx, + (unsigned long long)new_chain_key); + return new_chain_key; +} + +static void +print_chain_keys_held_locks(struct task_struct *curr, struct held_lock *hlock_next) +{ + struct held_lock *hlock; + u64 chain_key = 0; + int depth = curr->lockdep_depth; + int i; + + printk("depth: %u\n", depth + 1); + for (i = get_first_held_lock(curr, hlock_next); i < depth; i++) { + hlock = curr->held_locks + i; + chain_key = print_chain_key_iteration(hlock->class_idx, chain_key); + + print_lock(hlock); + } + + print_chain_key_iteration(hlock_next->class_idx, chain_key); + print_lock(hlock_next); +} + +static void print_chain_keys_chain(struct lock_chain *chain) +{ + int i; + u64 chain_key = 0; + int class_id; + + printk("depth: %u\n", chain->depth); + for (i = 0; i < chain->depth; i++) { + class_id = chain_hlocks[chain->base + i]; + chain_key = print_chain_key_iteration(class_id + 1, chain_key); + + print_lock_name(lock_classes + class_id); + printk("\n"); + } +} + +static void print_collision(struct task_struct *curr, + struct held_lock *hlock_next, + struct lock_chain *chain) +{ + printk("\n"); + printk("======================\n"); + printk("[chain_key collision ]\n"); + print_kernel_ident(); + printk("----------------------\n"); + printk("%s/%d: ", current->comm, task_pid_nr(current)); + printk("Hash chain already cached but the contents don't match!\n"); + + printk("Held locks:"); + print_chain_keys_held_locks(curr, hlock_next); + + printk("Locks in cached chain:"); + print_chain_keys_chain(chain); + + printk("\nstack backtrace:\n"); + dump_stack(); +} +#endif + +/* + * Checks whether the chain and the current held locks are consistent + * in depth and also in content. If they are not it most likely means + * that there was a collision during the calculation of the chain_key. + * Returns: 0 not passed, 1 passed + */ +static int check_no_collision(struct task_struct *curr, + struct held_lock *hlock, + struct lock_chain *chain) +{ +#ifdef CONFIG_DEBUG_LOCKDEP + int i, j, id; + + i = get_first_held_lock(curr, hlock); + + if (DEBUG_LOCKS_WARN_ON(chain->depth != curr->lockdep_depth - (i - 1))) { + print_collision(curr, hlock, chain); + return 0; + } + + for (j = 0; j < chain->depth - 1; j++, i++) { + id = curr->held_locks[i].class_idx - 1; + + if (DEBUG_LOCKS_WARN_ON(chain_hlocks[chain->base + j] != id)) { + print_collision(curr, hlock, chain); + return 0; + } + } +#endif + return 1; +} + +/* * Look up a dependency chain. If the key is not present yet then * add it and return 1 - in this case the new dependency chain is * validated. If the key is already hashed, return 0. @@ -2017,9 +2115,8 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; - struct held_lock *hlock_curr; int i, j; /* @@ -2033,10 +2130,13 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); + if (!check_no_collision(curr, hlock, chain)) + return 0; + if (very_verbose(class)) printk("\nhash chain already cached, key: " "%016Lx tail class: [%p] %s\n", @@ -2057,7 +2157,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2074,13 +2174,7 @@ cache_hit: chain = lock_chains + nr_lock_chains++; chain->chain_key = chain_key; chain->irq_context = hlock->irq_context; - /* Find the first held_lock of current chain */ - for (i = curr->lockdep_depth - 1; i >= 0; i--) { - hlock_curr = curr->held_locks + i; - if (hlock_curr->irq_context != hlock->irq_context) - break; - } - i++; + i = get_first_held_lock(curr, hlock); chain->depth = curr->lockdep_depth + 1 - i; if (likely(nr_chain_hlocks + chain->depth <= MAX_LOCKDEP_CHAIN_HLOCKS)) { chain->base = nr_chain_hlocks; @@ -2091,7 +2185,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -2168,7 +2262,7 @@ static void check_chain_key(struct task_struct *curr) { #ifdef CONFIG_DEBUG_LOCKDEP struct held_lock *hlock, *prev_hlock = NULL; - unsigned int i, id; + unsigned int i; u64 chain_key = 0; for (i = 0; i < curr->lockdep_depth; i++) { @@ -2185,17 +2279,16 @@ static void check_chain_key(struct task_struct *curr) (unsigned long long)hlock->prev_chain_key); return; } - id = hlock->class_idx - 1; /* * Whoops ran out of static storage again? */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(hlock->class_idx > MAX_LOCKDEP_KEYS)) return; if (prev_hlock && (prev_hlock->irq_context != hlock->irq_context)) chain_key = 0; - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, hlock->class_idx); prev_hlock = hlock; } if (chain_key != curr->curr_chain_key) { @@ -2738,7 +2831,7 @@ static void __lockdep_trace_alloc(gfp_t gfp_mask, unsigned long flags) return; /* no reclaim without waiting on it */ - if (!(gfp_mask & __GFP_WAIT)) + if (!(gfp_mask & __GFP_DIRECT_RECLAIM)) return; /* this guy won't enter reclaim */ @@ -3073,7 +3166,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, struct task_struct *curr = current; struct lock_class *class = NULL; struct held_lock *hlock; - unsigned int depth, id; + unsigned int depth; int chain_head = 0; int class_idx; u64 chain_key; @@ -3176,11 +3269,10 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, * The 'key ID' is what is the most compact key value to drive * the hash, not class->key. */ - id = class - lock_classes; /* * Whoops, we did it again.. ran straight out of our static allocation. */ - if (DEBUG_LOCKS_WARN_ON(id >= MAX_LOCKDEP_KEYS)) + if (DEBUG_LOCKS_WARN_ON(class_idx > MAX_LOCKDEP_KEYS)) return 0; chain_key = curr->curr_chain_key; @@ -3198,7 +3290,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass, chain_key = 0; chain_head = 1; } - chain_key = iterate_chain_key(chain_key, id); + chain_key = iterate_chain_key(chain_key, class_idx); if (nest_lock && !__lock_is_held(nest_lock)) return print_lock_nested_lock_not_held(curr, hlock, ip); @@ -3875,7 +3967,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3986,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +4009,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +4022,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +4052,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +4077,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4013,28 +4101,6 @@ out_restore: raw_local_irq_restore(flags); } -void lockdep_init(void) -{ - int i; - - /* - * Some architectures have their own start_kernel() - * code which calls lockdep_init(), while we also - * call lockdep_init() from the start_kernel() itself, - * and we want to initialize the hashes only once: - */ - if (lockdep_initialized) - return; - - for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); - - for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); - - lockdep_initialized = 1; -} - void __init lockdep_info(void) { printk("Lock dependency validator: Copyright (c) 2006 Red Hat, Inc., Ingo Molnar\n"); @@ -4061,14 +4127,6 @@ void __init lockdep_info(void) printk(" per task-struct memory footprint: %lu bytes\n", sizeof(struct held_lock) * MAX_LOCK_DEPTH); - -#ifdef CONFIG_DEBUG_LOCKDEP - if (lockdep_init_error) { - printk("WARNING: lockdep init error: lock '%s' was acquired before lockdep_init().\n", lock_init_error); - printk("Call stack leading to lockdep invocation was:\n"); - print_stack_trace(&lockdep_init_trace, 0); - } -#endif } static void diff --git a/kernel/locking/lockdep_proc.c b/kernel/locking/lockdep_proc.c index d83d798bef95..dbb61a302548 100644 --- a/kernel/locking/lockdep_proc.c +++ b/kernel/locking/lockdep_proc.c @@ -6,7 +6,7 @@ * Started by Ingo Molnar: * * Copyright (C) 2006,2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra * * Code for /proc/lockdep and /proc/lockdep_stats: * diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index 5b9102a47ea5..c835270f0c2f 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -67,7 +67,13 @@ void mcs_spin_lock(struct mcs_spinlock **lock, struct mcs_spinlock *node) node->locked = 0; node->next = NULL; - prev = xchg_acquire(lock, node); + /* + * We rely on the full barrier with global transitivity implied by the + * below xchg() to order the initialization stores above against any + * observation of @node. And to provide the ACQUIRE ordering associated + * with a LOCK primitive. + */ + prev = xchg(lock, node); if (likely(prev == NULL)) { /* * Lock acquired, don't need to set node->locked to 1. Threads diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 0551c219c40e..e364b424b019 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -716,6 +716,7 @@ static inline void __mutex_unlock_common_slowpath(struct mutex *lock, int nested) { unsigned long flags; + WAKE_Q(wake_q); /* * As a performance measurement, release the lock before doing other @@ -743,11 +744,11 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) struct mutex_waiter, list); debug_mutex_wake_waiter(lock, waiter); - - wake_up_process(waiter->task); + wake_q_add(&wake_q, waiter->task); } spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } /* diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index d092a0c9c2d4..05a37857ab55 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -93,10 +93,12 @@ bool osq_lock(struct optimistic_spin_queue *lock) node->cpu = curr; /* - * ACQUIRE semantics, pairs with corresponding RELEASE - * in unlock() uncontended, or fastpath. + * We need both ACQUIRE (pairs with corresponding RELEASE in + * unlock() uncontended, or fastpath) and RELEASE (to publish + * the node fields we just initialised) semantics when updating + * the lock tail. */ - old = atomic_xchg_acquire(&lock->tail, curr); + old = atomic_xchg(&lock->tail, curr); if (old == OSQ_UNLOCKED_VAL) return true; diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 87e9ce6a63c5..ce2f75e32ae1 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -14,8 +14,9 @@ * (C) Copyright 2013-2015 Hewlett-Packard Development Company, L.P. * (C) Copyright 2013-2014 Red Hat, Inc. * (C) Copyright 2015 Intel Corp. + * (C) Copyright 2015 Hewlett-Packard Enterprise Development LP * - * Authors: Waiman Long <waiman.long@hp.com> + * Authors: Waiman Long <waiman.long@hpe.com> * Peter Zijlstra <peterz@infradead.org> */ @@ -176,7 +177,12 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) { struct __qspinlock *l = (void *)lock; - return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; + /* + * Use release semantics to make sure that the MCS node is properly + * initialized before changing the tail code. + */ + return (u32)xchg_release(&l->tail, + tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET; } #else /* _Q_PENDING_BITS == 8 */ @@ -208,7 +214,11 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) for (;;) { new = (val & _Q_LOCKED_PENDING_MASK) | tail; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Use release semantics to make sure that the MCS node is + * properly initialized before changing the tail code. + */ + old = atomic_cmpxchg_release(&lock->val, val, new); if (old == val) break; @@ -238,18 +248,20 @@ static __always_inline void set_locked(struct qspinlock *lock) */ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { } +static __always_inline void __pv_wait_node(struct mcs_spinlock *node, + struct mcs_spinlock *prev) { } static __always_inline void __pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { } -static __always_inline void __pv_wait_head(struct qspinlock *lock, - struct mcs_spinlock *node) { } +static __always_inline u32 __pv_wait_head_or_lock(struct qspinlock *lock, + struct mcs_spinlock *node) + { return 0; } #define pv_enabled() false #define pv_init_node __pv_init_node #define pv_wait_node __pv_wait_node #define pv_kick_node __pv_kick_node -#define pv_wait_head __pv_wait_head +#define pv_wait_head_or_lock __pv_wait_head_or_lock #ifdef CONFIG_PARAVIRT_SPINLOCKS #define queued_spin_lock_slowpath native_queued_spin_lock_slowpath @@ -319,7 +331,11 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (val == new) new |= _Q_PENDING_VAL; - old = atomic_cmpxchg(&lock->val, val, new); + /* + * Acquire semantic is required here as the function may + * return immediately if the lock was free. + */ + old = atomic_cmpxchg_acquire(&lock->val, val, new); if (old == val) break; @@ -342,8 +358,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * sequentiality; this is because not all clear_pending_set_locked() * implementations imply full barriers. */ - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK) - cpu_relax(); + smp_cond_acquire(!(atomic_read(&lock->val) & _Q_LOCKED_MASK)); /* * take ownership and clear the pending bit. @@ -382,6 +397,7 @@ queue: * p,*,* -> n,*,* */ old = xchg_tail(lock, tail); + next = NULL; /* * if there was a previous node; link it and wait until reaching the @@ -391,8 +407,18 @@ queue: prev = decode_tail(old); WRITE_ONCE(prev->next, node); - pv_wait_node(node); + pv_wait_node(node, prev); arch_mcs_spin_lock_contended(&node->locked); + + /* + * While waiting for the MCS lock, the next pointer may have + * been set by another lock waiter. We optimistically load + * the next pointer & prefetch the cacheline for writing + * to reduce latency in the upcoming MCS unlock operation. + */ + next = READ_ONCE(node->next); + if (next) + prefetchw(next); } /* @@ -406,11 +432,22 @@ queue: * sequentiality; this is because the set_locked() function below * does not imply a full barrier. * + * The PV pv_wait_head_or_lock function, if active, will acquire + * the lock and return a non-zero value. So we have to skip the + * smp_cond_acquire() call. As the next PV queue head hasn't been + * designated yet, there is no way for the locked value to become + * _Q_SLOW_VAL. So both the set_locked() and the + * atomic_cmpxchg_relaxed() calls will be safe. + * + * If PV isn't active, 0 will be returned instead. + * */ - pv_wait_head(lock, node); - while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_PENDING_MASK) - cpu_relax(); + if ((val = pv_wait_head_or_lock(lock, node))) + goto locked; + + smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK)); +locked: /* * claim the lock: * @@ -422,11 +459,17 @@ queue: * to grab the lock. */ for (;;) { - if (val != tail) { + /* In the PV case we might already have _Q_LOCKED_VAL set */ + if ((val & _Q_TAIL_MASK) != tail) { set_locked(lock); break; } - old = atomic_cmpxchg(&lock->val, val, _Q_LOCKED_VAL); + /* + * The smp_cond_acquire() call above has provided the necessary + * acquire semantics required for locking. At most two + * iterations of this loop may be ran. + */ + old = atomic_cmpxchg_relaxed(&lock->val, val, _Q_LOCKED_VAL); if (old == val) goto release; /* No contention */ @@ -434,10 +477,12 @@ queue: } /* - * contended path; wait for next, release. + * contended path; wait for next if not observed yet, release. */ - while (!(next = READ_ONCE(node->next))) - cpu_relax(); + if (!next) { + while (!(next = READ_ONCE(node->next))) + cpu_relax(); + } arch_mcs_spin_unlock_contended(&next->locked); pv_kick_node(lock, next); @@ -462,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath); #undef pv_init_node #undef pv_wait_node #undef pv_kick_node -#undef pv_wait_head +#undef pv_wait_head_or_lock #undef queued_spin_lock_slowpath #define queued_spin_lock_slowpath __pv_queued_spin_lock_slowpath diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h index f0450ff4829b..21ede57f68b3 100644 --- a/kernel/locking/qspinlock_paravirt.h +++ b/kernel/locking/qspinlock_paravirt.h @@ -23,6 +23,20 @@ #define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET) /* + * Queue Node Adaptive Spinning + * + * A queue node vCPU will stop spinning if the vCPU in the previous node is + * not running. The one lock stealing attempt allowed at slowpath entry + * mitigates the slight slowdown for non-overcommitted guest with this + * aggressive wait-early mechanism. + * + * The status of the previous node will be checked at fixed interval + * controlled by PV_PREV_CHECK_MASK. This is to ensure that we won't + * pound on the cacheline of the previous node too heavily. + */ +#define PV_PREV_CHECK_MASK 0xff + +/* * Queue node uses: vcpu_running & vcpu_halted. * Queue head uses: vcpu_running & vcpu_hashed. */ @@ -41,6 +55,96 @@ struct pv_node { }; /* + * Include queued spinlock statistics code + */ +#include "qspinlock_stat.h" + +/* + * By replacing the regular queued_spin_trylock() with the function below, + * it will be called once when a lock waiter enter the PV slowpath before + * being queued. By allowing one lock stealing attempt here when the pending + * bit is off, it helps to reduce the performance impact of lock waiter + * preemption without the drawback of lock starvation. + */ +#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l) +static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + int ret = !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) && + (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0); + + qstat_inc(qstat_pv_lock_stealing, ret); + return ret; +} + +/* + * The pending bit is used by the queue head vCPU to indicate that it + * is actively spinning on the lock and no lock stealing is allowed. + */ +#if _Q_PENDING_BITS == 8 +static __always_inline void set_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 1); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + WRITE_ONCE(l->pending, 0); +} + +/* + * The pending bit check in pv_queued_spin_steal_lock() isn't a memory + * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock + * just to be sure that it will get it. + */ +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + + return !READ_ONCE(l->locked) && + (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL) + == _Q_PENDING_VAL); +} +#else /* _Q_PENDING_BITS == 8 */ +static __always_inline void set_pending(struct qspinlock *lock) +{ + atomic_set_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline void clear_pending(struct qspinlock *lock) +{ + atomic_clear_mask(_Q_PENDING_VAL, &lock->val); +} + +static __always_inline int trylock_clear_pending(struct qspinlock *lock) +{ + int val = atomic_read(&lock->val); + + for (;;) { + int old, new; + + if (val & _Q_LOCKED_MASK) + break; + + /* + * Try to clear pending bit & set locked bit + */ + old = val; + new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL; + val = atomic_cmpxchg(&lock->val, old, new); + + if (val == old) + return 1; + } + return 0; +} +#endif /* _Q_PENDING_BITS == 8 */ + +/* * Lock and MCS node addresses hash table for fast lookup * * Hashing is done on a per-cacheline basis to minimize the need to access @@ -100,10 +204,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node) { unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits); struct pv_hash_entry *he; + int hopcnt = 0; for_each_hash_entry(he, offset, hash) { + hopcnt++; if (!cmpxchg(&he->lock, NULL, lock)) { WRITE_ONCE(he->node, node); + qstat_hop(hopcnt); return &he->lock; } } @@ -144,6 +251,20 @@ static struct pv_node *pv_unhash(struct qspinlock *lock) } /* + * Return true if when it is time to check the previous node which is not + * in a running state. + */ +static inline bool +pv_wait_early(struct pv_node *prev, int loop) +{ + + if ((loop & PV_PREV_CHECK_MASK) != 0) + return false; + + return READ_ONCE(prev->state) != vcpu_running; +} + +/* * Initialize the PV part of the mcs_spinlock node. */ static void pv_init_node(struct mcs_spinlock *node) @@ -161,15 +282,23 @@ static void pv_init_node(struct mcs_spinlock *node) * pv_kick_node() is used to set _Q_SLOW_VAL and fill in hash table on its * behalf. */ -static void pv_wait_node(struct mcs_spinlock *node) +static void pv_wait_node(struct mcs_spinlock *node, struct mcs_spinlock *prev) { struct pv_node *pn = (struct pv_node *)node; + struct pv_node *pp = (struct pv_node *)prev; + int waitcnt = 0; int loop; + bool wait_early; - for (;;) { - for (loop = SPIN_THRESHOLD; loop; loop--) { + /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */ + for (;; waitcnt++) { + for (wait_early = false, loop = SPIN_THRESHOLD; loop; loop--) { if (READ_ONCE(node->locked)) return; + if (pv_wait_early(pp, loop)) { + wait_early = true; + break; + } cpu_relax(); } @@ -184,12 +313,17 @@ static void pv_wait_node(struct mcs_spinlock *node) */ smp_store_mb(pn->state, vcpu_halted); - if (!READ_ONCE(node->locked)) + if (!READ_ONCE(node->locked)) { + qstat_inc(qstat_pv_wait_node, true); + qstat_inc(qstat_pv_wait_again, waitcnt); + qstat_inc(qstat_pv_wait_early, wait_early); pv_wait(&pn->state, vcpu_halted); + } /* - * If pv_kick_node() changed us to vcpu_hashed, retain that value - * so that pv_wait_head() knows to not also try to hash this lock. + * If pv_kick_node() changed us to vcpu_hashed, retain that + * value so that pv_wait_head_or_lock() knows to not also try + * to hash this lock. */ cmpxchg(&pn->state, vcpu_halted, vcpu_running); @@ -200,6 +334,7 @@ static void pv_wait_node(struct mcs_spinlock *node) * So it is better to spin for a while in the hope that the * MCS lock will be released soon. */ + qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked)); } /* @@ -212,8 +347,9 @@ static void pv_wait_node(struct mcs_spinlock *node) /* * Called after setting next->locked = 1 when we're the lock owner. * - * Instead of waking the waiters stuck in pv_wait_node() advance their state such - * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle. + * Instead of waking the waiters stuck in pv_wait_node() advance their state + * such that they're waiting in pv_wait_head_or_lock(), this avoids a + * wake/sleep cycle. */ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) { @@ -242,14 +378,19 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node) } /* - * Wait for l->locked to become clear; halt the vcpu after a short spin. + * Wait for l->locked to become clear and acquire the lock; + * halt the vcpu after a short spin. * __pv_queued_spin_unlock() will wake us. + * + * The current value of the lock will be returned for additional processing. */ -static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) +static u32 +pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node) { struct pv_node *pn = (struct pv_node *)node; struct __qspinlock *l = (void *)lock; struct qspinlock **lp = NULL; + int waitcnt = 0; int loop; /* @@ -259,12 +400,30 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) if (READ_ONCE(pn->state) == vcpu_hashed) lp = (struct qspinlock **)1; - for (;;) { + /* + * Tracking # of slowpath locking operations + */ + qstat_inc(qstat_pv_lock_slowpath, true); + + for (;; waitcnt++) { + /* + * Set correct vCPU state to be used by queue node wait-early + * mechanism. + */ + WRITE_ONCE(pn->state, vcpu_running); + + /* + * Set the pending bit in the active lock spinning loop to + * disable lock stealing before attempting to acquire the lock. + */ + set_pending(lock); for (loop = SPIN_THRESHOLD; loop; loop--) { - if (!READ_ONCE(l->locked)) - return; + if (trylock_clear_pending(lock)) + goto gotlock; cpu_relax(); } + clear_pending(lock); + if (!lp) { /* ONCE */ lp = pv_hash(lock, pn); @@ -280,51 +439,50 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node) * * Matches the smp_rmb() in __pv_queued_spin_unlock(). */ - if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) { + if (xchg(&l->locked, _Q_SLOW_VAL) == 0) { /* - * The lock is free and _Q_SLOW_VAL has never - * been set. Therefore we need to unhash before - * getting the lock. + * The lock was free and now we own the lock. + * Change the lock value back to _Q_LOCKED_VAL + * and unhash the table. */ + WRITE_ONCE(l->locked, _Q_LOCKED_VAL); WRITE_ONCE(*lp, NULL); - return; + goto gotlock; } } + WRITE_ONCE(pn->state, vcpu_halted); + qstat_inc(qstat_pv_wait_head, true); + qstat_inc(qstat_pv_wait_again, waitcnt); pv_wait(&l->locked, _Q_SLOW_VAL); /* * The unlocker should have freed the lock before kicking the * CPU. So if the lock is still not free, it is a spurious - * wakeup and so the vCPU should wait again after spinning for - * a while. + * wakeup or another vCPU has stolen the lock. The current + * vCPU should spin again. */ + qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked)); } /* - * Lock is unlocked now; the caller will acquire it without waiting. - * As with pv_wait_node() we rely on the caller to do a load-acquire - * for us. + * The cmpxchg() or xchg() call before coming here provides the + * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL + * here is to indicate to the compiler that the value will always + * be nozero to enable better code optimization. */ +gotlock: + return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL); } /* - * PV version of the unlock function to be used in stead of - * queued_spin_unlock(). + * PV versions of the unlock fastpath and slowpath functions to be used + * instead of queued_spin_unlock(). */ -__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +__visible void +__pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked) { struct __qspinlock *l = (void *)lock; struct pv_node *node; - u8 locked; - - /* - * We must not unlock if SLOW, because in that case we must first - * unhash. Otherwise it would be possible to have multiple @lock - * entries, which would be BAD. - */ - locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); - if (likely(locked == _Q_LOCKED_VAL)) - return; if (unlikely(locked != _Q_SLOW_VAL)) { WARN(!debug_locks_silent, @@ -338,7 +496,7 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * so we need a barrier to order the read of the node data in * pv_unhash *after* we've read the lock being _Q_SLOW_VAL. * - * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL. + * Matches the cmpxchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL. */ smp_rmb(); @@ -361,14 +519,35 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock) * vCPU is harmless other than the additional latency in completing * the unlock. */ + qstat_inc(qstat_pv_kick_unlock, true); pv_kick(node->cpu); } + /* * Include the architecture specific callee-save thunk of the * __pv_queued_spin_unlock(). This thunk is put together with - * __pv_queued_spin_unlock() near the top of the file to make sure - * that the callee-save thunk and the real unlock function are close - * to each other sharing consecutive instruction cachelines. + * __pv_queued_spin_unlock() to make the callee-save thunk and the real unlock + * function close to each other sharing consecutive instruction cachelines. + * Alternatively, architecture specific version of __pv_queued_spin_unlock() + * can be defined. */ #include <asm/qspinlock_paravirt.h> +#ifndef __pv_queued_spin_unlock +__visible void __pv_queued_spin_unlock(struct qspinlock *lock) +{ + struct __qspinlock *l = (void *)lock; + u8 locked; + + /* + * We must not unlock if SLOW, because in that case we must first + * unhash. Otherwise it would be possible to have multiple @lock + * entries, which would be BAD. + */ + locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0); + if (likely(locked == _Q_LOCKED_VAL)) + return; + + __pv_queued_spin_unlock_slowpath(lock, locked); +} +#endif /* __pv_queued_spin_unlock */ diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h new file mode 100644 index 000000000000..eb2a2c9bc3fc --- /dev/null +++ b/kernel/locking/qspinlock_stat.h @@ -0,0 +1,290 @@ +/* + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * Authors: Waiman Long <waiman.long@hpe.com> + */ + +/* + * When queued spinlock statistical counters are enabled, the following + * debugfs files will be created for reporting the counter values: + * + * <debugfs>/qlockstat/ + * pv_hash_hops - average # of hops per hashing operation + * pv_kick_unlock - # of vCPU kicks issued at unlock time + * pv_kick_wake - # of vCPU kicks used for computing pv_latency_wake + * pv_latency_kick - average latency (ns) of vCPU kick operation + * pv_latency_wake - average latency (ns) from vCPU kick to wakeup + * pv_lock_slowpath - # of locking operations via the slowpath + * pv_lock_stealing - # of lock stealing operations + * pv_spurious_wakeup - # of spurious wakeups + * pv_wait_again - # of vCPU wait's that happened after a vCPU kick + * pv_wait_early - # of early vCPU wait's + * pv_wait_head - # of vCPU wait's at the queue head + * pv_wait_node - # of vCPU wait's at a non-head queue node + * + * Writing to the "reset_counters" file will reset all the above counter + * values. + * + * These statistical counters are implemented as per-cpu variables which are + * summed and computed whenever the corresponding debugfs files are read. This + * minimizes added overhead making the counters usable even in a production + * environment. + * + * There may be slight difference between pv_kick_wake and pv_kick_unlock. + */ +enum qlock_stats { + qstat_pv_hash_hops, + qstat_pv_kick_unlock, + qstat_pv_kick_wake, + qstat_pv_latency_kick, + qstat_pv_latency_wake, + qstat_pv_lock_slowpath, + qstat_pv_lock_stealing, + qstat_pv_spurious_wakeup, + qstat_pv_wait_again, + qstat_pv_wait_early, + qstat_pv_wait_head, + qstat_pv_wait_node, + qstat_num, /* Total number of statistical counters */ + qstat_reset_cnts = qstat_num, +}; + +#ifdef CONFIG_QUEUED_LOCK_STAT +/* + * Collect pvqspinlock statistics + */ +#include <linux/debugfs.h> +#include <linux/sched.h> +#include <linux/fs.h> + +static const char * const qstat_names[qstat_num + 1] = { + [qstat_pv_hash_hops] = "pv_hash_hops", + [qstat_pv_kick_unlock] = "pv_kick_unlock", + [qstat_pv_kick_wake] = "pv_kick_wake", + [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup", + [qstat_pv_latency_kick] = "pv_latency_kick", + [qstat_pv_latency_wake] = "pv_latency_wake", + [qstat_pv_lock_slowpath] = "pv_lock_slowpath", + [qstat_pv_lock_stealing] = "pv_lock_stealing", + [qstat_pv_wait_again] = "pv_wait_again", + [qstat_pv_wait_early] = "pv_wait_early", + [qstat_pv_wait_head] = "pv_wait_head", + [qstat_pv_wait_node] = "pv_wait_node", + [qstat_reset_cnts] = "reset_counters", +}; + +/* + * Per-cpu counters + */ +static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]); +static DEFINE_PER_CPU(u64, pv_kick_time); + +/* + * Function to read and return the qlock statistical counter values + * + * The following counters are handled specially: + * 1. qstat_pv_latency_kick + * Average kick latency (ns) = pv_latency_kick/pv_kick_unlock + * 2. qstat_pv_latency_wake + * Average wake latency (ns) = pv_latency_wake/pv_kick_wake + * 3. qstat_pv_hash_hops + * Average hops/hash = pv_hash_hops/pv_kick_unlock + */ +static ssize_t qstat_read(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + char buf[64]; + int cpu, counter, len; + u64 stat = 0, kicks = 0; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + counter = (long)(file->f_inode->i_private); + + if (counter >= qstat_num) + return -EBADF; + + for_each_possible_cpu(cpu) { + stat += per_cpu(qstats[counter], cpu); + /* + * Need to sum additional counter for some of them + */ + switch (counter) { + + case qstat_pv_latency_kick: + case qstat_pv_hash_hops: + kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu); + break; + + case qstat_pv_latency_wake: + kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu); + break; + } + } + + if (counter == qstat_pv_hash_hops) { + u64 frac; + + frac = 100ULL * do_div(stat, kicks); + frac = DIV_ROUND_CLOSEST_ULL(frac, kicks); + + /* + * Return a X.XX decimal number + */ + len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac); + } else { + /* + * Round to the nearest ns + */ + if ((counter == qstat_pv_latency_kick) || + (counter == qstat_pv_latency_wake)) { + stat = 0; + if (kicks) + stat = DIV_ROUND_CLOSEST_ULL(stat, kicks); + } + len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat); + } + + return simple_read_from_buffer(user_buf, count, ppos, buf, len); +} + +/* + * Function to handle write request + * + * When counter = reset_cnts, reset all the counter values. + * Since the counter updates aren't atomic, the resetting is done twice + * to make sure that the counters are very likely to be all cleared. + */ +static ssize_t qstat_write(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + int cpu; + + /* + * Get the counter ID stored in file->f_inode->i_private + */ + if (!file->f_inode) { + WARN_ON_ONCE(1); + return -EBADF; + } + if ((long)(file->f_inode->i_private) != qstat_reset_cnts) + return count; + + for_each_possible_cpu(cpu) { + int i; + unsigned long *ptr = per_cpu_ptr(qstats, cpu); + + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + for (i = 0 ; i < qstat_num; i++) + WRITE_ONCE(ptr[i], 0); + } + return count; +} + +/* + * Debugfs data structures + */ +static const struct file_operations fops_qstat = { + .read = qstat_read, + .write = qstat_write, + .llseek = default_llseek, +}; + +/* + * Initialize debugfs for the qspinlock statistical counters + */ +static int __init init_qspinlock_stat(void) +{ + struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL); + int i; + + if (!d_qstat) { + pr_warn("Could not create 'qlockstat' debugfs directory\n"); + return 0; + } + + /* + * Create the debugfs files + * + * As reading from and writing to the stat files can be slow, only + * root is allowed to do the read/write to limit impact to system + * performance. + */ + for (i = 0; i < qstat_num; i++) + debugfs_create_file(qstat_names[i], 0400, d_qstat, + (void *)(long)i, &fops_qstat); + + debugfs_create_file(qstat_names[qstat_reset_cnts], 0200, d_qstat, + (void *)(long)qstat_reset_cnts, &fops_qstat); + return 0; +} +fs_initcall(init_qspinlock_stat); + +/* + * Increment the PV qspinlock statistical counters + */ +static inline void qstat_inc(enum qlock_stats stat, bool cond) +{ + if (cond) + this_cpu_inc(qstats[stat]); +} + +/* + * PV hash hop count + */ +static inline void qstat_hop(int hopcnt) +{ + this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt); +} + +/* + * Replacement function for pv_kick() + */ +static inline void __pv_kick(int cpu) +{ + u64 start = sched_clock(); + + per_cpu(pv_kick_time, cpu) = start; + pv_kick(cpu); + this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start); +} + +/* + * Replacement function for pv_wait() + */ +static inline void __pv_wait(u8 *ptr, u8 val) +{ + u64 *pkick_time = this_cpu_ptr(&pv_kick_time); + + *pkick_time = 0; + pv_wait(ptr, val); + if (*pkick_time) { + this_cpu_add(qstats[qstat_pv_latency_wake], + sched_clock() - *pkick_time); + qstat_inc(qstat_pv_kick_wake, true); + } +} + +#define pv_kick(c) __pv_kick(c) +#define pv_wait(p, v) __pv_wait(p, v) + +#else /* CONFIG_QUEUED_LOCK_STAT */ + +static inline void qstat_inc(enum qlock_stats stat, bool cond) { } +static inline void qstat_hop(int hopcnt) { } + +#endif /* CONFIG_QUEUED_LOCK_STAT */ diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd9c0..3e746607abe5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current tasks pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 9d6b55587eaa..a6d382312e6f 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -10,8 +10,11 @@ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. */ +#include <linux/radix-tree.h> +#include <linux/memremap.h> #include <linux/device.h> #include <linux/types.h> +#include <linux/pfn_t.h> #include <linux/io.h> #include <linux/mm.h> #include <linux/memory_hotplug.h> @@ -26,10 +29,10 @@ __weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) static void *try_ram_remap(resource_size_t offset, size_t size) { - struct page *page = pfn_to_page(offset >> PAGE_SHIFT); + unsigned long pfn = PHYS_PFN(offset); /* In the simple case just return the existing linear address */ - if (!PageHighMem(page)) + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn))) return __va(offset); return NULL; /* fallback to ioremap_cache */ } @@ -38,13 +41,15 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * memremap() - remap an iomem_resource as cacheable memory * @offset: iomem resource start address * @size: size of remap - * @flags: either MEMREMAP_WB or MEMREMAP_WT + * @flags: any of MEMREMAP_WB, MEMREMAP_WT and MEMREMAP_WC * * memremap() is "ioremap" for cases where it is known that the resource * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. * - * MEMREMAP_WB - matches the default mapping for "System RAM" on + * MEMREMAP_WB - matches the default mapping for System RAM on * the architecture. This is usually a read-allocate write-back cache. * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM * memremap() will bypass establishing a new mapping and instead return @@ -53,13 +58,21 @@ static void *try_ram_remap(resource_size_t offset, size_t size) * MEMREMAP_WT - establish a mapping whereby writes either bypass the * cache or are written through to memory and never exist in a * cache-dirty state with respect to program visibility. Attempts to - * map "System RAM" with this mapping type will fail. + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. */ void *memremap(resource_size_t offset, size_t size, unsigned long flags) { - int is_ram = region_intersects(offset, size, "System RAM"); + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); void *addr = NULL; + if (!flags) + return NULL; + if (is_ram == REGION_MIXED) { WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", &offset, (unsigned long) size); @@ -68,12 +81,11 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) /* Try all mapping types requested until one returns non-NULL */ if (flags & MEMREMAP_WB) { - flags &= ~MEMREMAP_WB; /* * MEMREMAP_WB is special in that it can be satisifed * from the direct map. Some archs depend on the * capability of memremap() to autodetect cases where - * the requested range is potentially in "System RAM" + * the requested range is potentially in System RAM. */ if (is_ram == REGION_INTERSECTS) addr = try_ram_remap(offset, size); @@ -82,21 +94,22 @@ void *memremap(resource_size_t offset, size_t size, unsigned long flags) } /* - * If we don't have a mapping yet and more request flags are - * pending then we will be attempting to establish a new virtual + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual * address mapping. Enforce that this mapping is not aliasing - * "System RAM" + * System RAM. */ - if (!addr && is_ram == REGION_INTERSECTS && flags) { + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", &offset, (unsigned long) size); return NULL; } - if (!addr && (flags & MEMREMAP_WT)) { - flags &= ~MEMREMAP_WT; + if (!addr && (flags & MEMREMAP_WT)) addr = ioremap_wt(offset, size); - } + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); return addr; } @@ -111,7 +124,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -124,16 +137,19 @@ void *devm_memremap(struct device *dev, resource_size_t offset, { void **ptr, *addr; - ptr = devres_alloc(devm_memremap_release, sizeof(*ptr), GFP_KERNEL); + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); if (!ptr) - return NULL; + return ERR_PTR(-ENOMEM); addr = memremap(offset, size, flags); if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } @@ -141,31 +157,140 @@ EXPORT_SYMBOL(devm_memremap); void devm_memunmap(struct device *dev, void *addr) { - WARN_ON(devres_destroy(dev, devm_memremap_release, devm_memremap_match, - addr)); - memunmap(addr); + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); } EXPORT_SYMBOL(devm_memunmap); +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) +{ + return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); +} +EXPORT_SYMBOL(phys_to_pfn_t); + #ifdef CONFIG_ZONE_DEVICE +static DEFINE_MUTEX(pgmap_lock); +static RADIX_TREE(pgmap_radix, GFP_KERNEL); +#define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) +#define SECTION_SIZE (1UL << PA_SECTION_SHIFT) + struct page_map { struct resource res; + struct percpu_ref *ref; + struct dev_pagemap pgmap; + struct vmem_altmap altmap; }; -static void devm_memremap_pages_release(struct device *dev, void *res) +void get_zone_device_page(struct page *page) +{ + percpu_ref_get(page->pgmap->ref); +} +EXPORT_SYMBOL(get_zone_device_page); + +void put_zone_device_page(struct page *page) +{ + put_dev_pagemap(page->pgmap); +} +EXPORT_SYMBOL(put_zone_device_page); + +static void pgmap_radix_release(struct resource *res) { - struct page_map *page_map = res; + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + + mutex_lock(&pgmap_lock); + for (key = res->start; key <= res->end; key += SECTION_SIZE) + radix_tree_delete(&pgmap_radix, key >> PA_SECTION_SHIFT); + mutex_unlock(&pgmap_lock); +} + +static unsigned long pfn_first(struct page_map *page_map) +{ + struct dev_pagemap *pgmap = &page_map->pgmap; + const struct resource *res = &page_map->res; + struct vmem_altmap *altmap = pgmap->altmap; + unsigned long pfn; + + pfn = res->start >> PAGE_SHIFT; + if (altmap) + pfn += vmem_altmap_offset(altmap); + return pfn; +} + +static unsigned long pfn_end(struct page_map *page_map) +{ + const struct resource *res = &page_map->res; + + return (res->start + resource_size(res)) >> PAGE_SHIFT; +} + +#define for_each_device_pfn(pfn, map) \ + for (pfn = pfn_first(map); pfn < pfn_end(map); pfn++) + +static void devm_memremap_pages_release(struct device *dev, void *data) +{ + struct page_map *page_map = data; + struct resource *res = &page_map->res; + resource_size_t align_start, align_size; + struct dev_pagemap *pgmap = &page_map->pgmap; + + if (percpu_ref_tryget_live(pgmap->ref)) { + dev_WARN(dev, "%s: page mapping is still live!\n", __func__); + percpu_ref_put(pgmap->ref); + } /* pages are dead and unused, undo the arch mapping */ - arch_remove_memory(page_map->res.start, resource_size(&page_map->res)); + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); + dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, + "%s: failed to free all reserved pages\n", __func__); +} + +/* assumes rcu_read_lock() held at entry */ +struct dev_pagemap *find_dev_pagemap(resource_size_t phys) +{ + struct page_map *page_map; + + WARN_ON_ONCE(!rcu_read_lock_held()); + + page_map = radix_tree_lookup(&pgmap_radix, phys >> PA_SECTION_SHIFT); + return page_map ? &page_map->pgmap : NULL; } -void *devm_memremap_pages(struct device *dev, struct resource *res) +/** + * devm_memremap_pages - remap and provide memmap backing for the given resource + * @dev: hosting device for @res + * @res: "host memory" address range + * @ref: a live per-cpu reference count + * @altmap: optional descriptor for allocating the memmap from @res + * + * Notes: + * 1/ @ref must be 'live' on entry and 'dead' before devm_memunmap_pages() time + * (or devm release event). + * + * 2/ @res is expected to be a host memory range that could feasibly be + * treated as a "System RAM" range, i.e. not a device mmio range, but + * this is not enforced. + */ +void *devm_memremap_pages(struct device *dev, struct resource *res, + struct percpu_ref *ref, struct vmem_altmap *altmap) { - int is_ram = region_intersects(res->start, resource_size(res), - "System RAM"); + resource_size_t key, align_start, align_size, align_end; + struct dev_pagemap *pgmap; struct page_map *page_map; - int error, nid; + int error, nid, is_ram; + unsigned long pfn; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(res->start + resource_size(res), SECTION_SIZE) + - align_start; + is_ram = region_intersects(align_start, align_size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); if (is_ram == REGION_MIXED) { WARN_ONCE(1, "%s attempted on mixed region %pr\n", @@ -176,25 +301,124 @@ void *devm_memremap_pages(struct device *dev, struct resource *res) if (is_ram == REGION_INTERSECTS) return __va(res->start); - page_map = devres_alloc(devm_memremap_pages_release, - sizeof(*page_map), GFP_KERNEL); + if (altmap && !IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP)) { + dev_err(dev, "%s: altmap requires CONFIG_SPARSEMEM_VMEMMAP=y\n", + __func__); + return ERR_PTR(-ENXIO); + } + + if (!ref) + return ERR_PTR(-EINVAL); + + page_map = devres_alloc_node(devm_memremap_pages_release, + sizeof(*page_map), GFP_KERNEL, dev_to_node(dev)); if (!page_map) return ERR_PTR(-ENOMEM); + pgmap = &page_map->pgmap; memcpy(&page_map->res, res, sizeof(*res)); + pgmap->dev = dev; + if (altmap) { + memcpy(&page_map->altmap, altmap, sizeof(*altmap)); + pgmap->altmap = &page_map->altmap; + } + pgmap->ref = ref; + pgmap->res = &page_map->res; + + mutex_lock(&pgmap_lock); + error = 0; + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { + struct dev_pagemap *dup; + + rcu_read_lock(); + dup = find_dev_pagemap(key); + rcu_read_unlock(); + if (dup) { + dev_err(dev, "%s: %pr collides with mapping for %s\n", + __func__, res, dev_name(dup->dev)); + error = -EBUSY; + break; + } + error = radix_tree_insert(&pgmap_radix, key >> PA_SECTION_SHIFT, + page_map); + if (error) { + dev_err(dev, "%s: failed: %d\n", __func__, error); + break; + } + } + mutex_unlock(&pgmap_lock); + if (error) + goto err_radix; + nid = dev_to_node(dev); if (nid < 0) - nid = 0; + nid = numa_mem_id(); - error = arch_add_memory(nid, res->start, resource_size(res), true); - if (error) { - devres_free(page_map); - return ERR_PTR(error); - } + error = arch_add_memory(nid, align_start, align_size, true); + if (error) + goto err_add_memory; + + for_each_device_pfn(pfn, page_map) { + struct page *page = pfn_to_page(pfn); + /* + * ZONE_DEVICE pages union ->lru with a ->pgmap back + * pointer. It is a bug if a ZONE_DEVICE page is ever + * freed or placed on a driver-private list. Seed the + * storage with LIST_POISON* values. + */ + list_del(&page->lru); + page->pgmap = pgmap; + } devres_add(dev, page_map); return __va(res->start); + + err_add_memory: + err_radix: + pgmap_radix_release(res); + devres_free(page_map); + return ERR_PTR(error); } EXPORT_SYMBOL(devm_memremap_pages); + +unsigned long vmem_altmap_offset(struct vmem_altmap *altmap) +{ + /* number of pfns from base where pfn_to_page() is valid */ + return altmap->reserve + altmap->free; +} + +void vmem_altmap_free(struct vmem_altmap *altmap, unsigned long nr_pfns) +{ + altmap->alloc -= nr_pfns; +} + +#ifdef CONFIG_SPARSEMEM_VMEMMAP +struct vmem_altmap *to_vmem_altmap(unsigned long memmap_start) +{ + /* + * 'memmap_start' is the virtual address for the first "struct + * page" in this range of the vmemmap array. In the case of + * CONFIG_SPARSEMEM_VMEMMAP a page_to_pfn conversion is simple + * pointer arithmetic, so we can perform this to_vmem_altmap() + * conversion without concern for the initialization state of + * the struct page fields. + */ + struct page *page = (struct page *) memmap_start; + struct dev_pagemap *pgmap; + + /* + * Unconditionally retrieve a dev_pagemap associated with the + * given physical address, this is only for use in the + * arch_{add|remove}_memory() for setting up and tearing down + * the memmap. + */ + rcu_read_lock(); + pgmap = find_dev_pagemap(__pfn_to_phys(page_to_pfn(page))); + rcu_read_unlock(); + + return pgmap ? pgmap->altmap : NULL; +} +#endif /* CONFIG_SPARSEMEM_VMEMMAP */ #endif /* CONFIG_ZONE_DEVICE */ diff --git a/kernel/module.c b/kernel/module.c index 8f051a106676..041200ca4a2d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -53,6 +53,7 @@ #include <asm/sections.h> #include <linux/tracepoint.h> #include <linux/ftrace.h> +#include <linux/livepatch.h> #include <linux/async.h> #include <linux/percpu.h> #include <linux/kmemleak.h> @@ -80,15 +81,6 @@ # define debug_align(X) (X) #endif -/* - * Given BASE and SIZE this macro calculates the number of pages the - * memory regions occupies - */ -#define MOD_NUMBER_OF_PAGES(BASE, SIZE) (((SIZE) > 0) ? \ - (PFN_DOWN((unsigned long)(BASE) + (SIZE) - 1) - \ - PFN_DOWN((unsigned long)BASE) + 1) \ - : (0UL)) - /* If this is set, the section belongs in the init part of the module */ #define INIT_OFFSET_MASK (1UL << (BITS_PER_LONG-1)) @@ -108,13 +100,6 @@ static LIST_HEAD(modules); * Use a latched RB-tree for __module_address(); this allows us to use * RCU-sched lookups of the address from any context. * - * Because modules have two address ranges: init and core, we need two - * latch_tree_nodes entries. Therefore we need the back-pointer from - * mod_tree_node. - * - * Because init ranges are short lived we mark them unlikely and have placed - * them outside the critical cacheline in struct module. - * * This is conditional on PERF_EVENTS || TRACING because those can really hit * __module_address() hard by doing a lot of stack unwinding; potentially from * NMI context. @@ -122,24 +107,16 @@ static LIST_HEAD(modules); static __always_inline unsigned long __mod_tree_val(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->module_init; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->module_core; + return (unsigned long)layout->base; } static __always_inline unsigned long __mod_tree_size(struct latch_tree_node *n) { - struct mod_tree_node *mtn = container_of(n, struct mod_tree_node, node); - struct module *mod = mtn->mod; - - if (unlikely(mtn == &mod->mtn_init)) - return (unsigned long)mod->init_size; + struct module_layout *layout = container_of(n, struct module_layout, mtn.node); - return (unsigned long)mod->core_size; + return (unsigned long)layout->size; } static __always_inline bool @@ -197,23 +174,23 @@ static void __mod_tree_remove(struct mod_tree_node *node) */ static void mod_tree_insert(struct module *mod) { - mod->mtn_core.mod = mod; - mod->mtn_init.mod = mod; + mod->core_layout.mtn.mod = mod; + mod->init_layout.mtn.mod = mod; - __mod_tree_insert(&mod->mtn_core); - if (mod->init_size) - __mod_tree_insert(&mod->mtn_init); + __mod_tree_insert(&mod->core_layout.mtn); + if (mod->init_layout.size) + __mod_tree_insert(&mod->init_layout.mtn); } static void mod_tree_remove_init(struct module *mod) { - if (mod->init_size) - __mod_tree_remove(&mod->mtn_init); + if (mod->init_layout.size) + __mod_tree_remove(&mod->init_layout.mtn); } static void mod_tree_remove(struct module *mod) { - __mod_tree_remove(&mod->mtn_core); + __mod_tree_remove(&mod->core_layout.mtn); mod_tree_remove_init(mod); } @@ -267,9 +244,9 @@ static void __mod_update_bounds(void *base, unsigned int size) static void mod_update_bounds(struct module *mod) { - __mod_update_bounds(mod->module_core, mod->core_size); - if (mod->init_size) - __mod_update_bounds(mod->module_init, mod->init_size); + __mod_update_bounds(mod->core_layout.base, mod->core_layout.size); + if (mod->init_layout.size) + __mod_update_bounds(mod->init_layout.base, mod->init_layout.size); } #ifdef CONFIG_KGDB_KDB @@ -327,6 +304,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -1005,6 +985,9 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -1214,7 +1197,7 @@ struct module_attribute module_uevent = static ssize_t show_coresize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->core_size); + return sprintf(buffer, "%u\n", mk->mod->core_layout.size); } static struct module_attribute modinfo_coresize = @@ -1223,7 +1206,7 @@ static struct module_attribute modinfo_coresize = static ssize_t show_initsize(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%u\n", mk->mod->init_size); + return sprintf(buffer, "%u\n", mk->mod->init_layout.size); } static struct module_attribute modinfo_initsize = @@ -1873,64 +1856,75 @@ static void mod_sysfs_teardown(struct module *mod) /* * LKM RO/NX protection: protect module's text/ro-data * from modification and any data from execution. + * + * General layout of module is: + * [text] [read-only-data] [writable data] + * text_size -----^ ^ ^ + * ro_size ------------------------| | + * size -------------------------------------------| + * + * These values are always page-aligned (as is base) */ -void set_page_attributes(void *start, void *end, int (*set)(unsigned long start, int num_pages)) +static void frob_text(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - unsigned long begin_pfn = PFN_DOWN((unsigned long)start); - unsigned long end_pfn = PFN_DOWN((unsigned long)end); + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base, + layout->text_size >> PAGE_SHIFT); +} - if (end_pfn > begin_pfn) - set(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); +static void frob_rodata(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) +{ + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->text_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->text_size, + (layout->ro_size - layout->text_size) >> PAGE_SHIFT); } -static void set_section_ro_nx(void *base, - unsigned long text_size, - unsigned long ro_size, - unsigned long total_size) +static void frob_writable_data(const struct module_layout *layout, + int (*set_memory)(unsigned long start, int num_pages)) { - /* begin and end PFNs of the current subsection */ - unsigned long begin_pfn; - unsigned long end_pfn; + BUG_ON((unsigned long)layout->base & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->ro_size & (PAGE_SIZE-1)); + BUG_ON((unsigned long)layout->size & (PAGE_SIZE-1)); + set_memory((unsigned long)layout->base + layout->ro_size, + (layout->size - layout->ro_size) >> PAGE_SHIFT); +} - /* - * Set RO for module text and RO-data: - * - Always protect first page. - * - Do not protect last partial page. - */ - if (ro_size > 0) - set_page_attributes(base, base + ro_size, set_memory_ro); +/* livepatching wants to disable read-only so it can frob module. */ +void module_disable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); + frob_rodata(&mod->init_layout, set_memory_rw); +} - /* - * Set NX permissions for module data: - * - Do not protect first partial page. - * - Always protect last page. - */ - if (total_size > text_size) { - begin_pfn = PFN_UP((unsigned long)base + text_size); - end_pfn = PFN_UP((unsigned long)base + total_size); - if (end_pfn > begin_pfn) - set_memory_nx(begin_pfn << PAGE_SHIFT, end_pfn - begin_pfn); - } +void module_enable_ro(const struct module *mod) +{ + frob_text(&mod->core_layout, set_memory_ro); + frob_rodata(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); + frob_rodata(&mod->init_layout, set_memory_ro); } -static void unset_module_core_ro_nx(struct module *mod) +static void module_enable_nx(const struct module *mod) { - set_page_attributes(mod->module_core + mod->core_text_size, - mod->module_core + mod->core_size, - set_memory_x); - set_page_attributes(mod->module_core, - mod->module_core + mod->core_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_nx); + frob_writable_data(&mod->core_layout, set_memory_nx); + frob_rodata(&mod->init_layout, set_memory_nx); + frob_writable_data(&mod->init_layout, set_memory_nx); } -static void unset_module_init_ro_nx(struct module *mod) +static void module_disable_nx(const struct module *mod) { - set_page_attributes(mod->module_init + mod->init_text_size, - mod->module_init + mod->init_size, - set_memory_x); - set_page_attributes(mod->module_init, - mod->module_init + mod->init_ro_size, - set_memory_rw); + frob_rodata(&mod->core_layout, set_memory_x); + frob_writable_data(&mod->core_layout, set_memory_x); + frob_rodata(&mod->init_layout, set_memory_x); + frob_writable_data(&mod->init_layout, set_memory_x); } /* Iterate through all modules and set each module's text as RW */ @@ -1942,16 +1936,9 @@ void set_all_modules_text_rw(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_rw); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_rw); - } + + frob_text(&mod->core_layout, set_memory_rw); + frob_text(&mod->init_layout, set_memory_rw); } mutex_unlock(&module_mutex); } @@ -1965,23 +1952,25 @@ void set_all_modules_text_ro(void) list_for_each_entry_rcu(mod, &modules, list) { if (mod->state == MODULE_STATE_UNFORMED) continue; - if ((mod->module_core) && (mod->core_text_size)) { - set_page_attributes(mod->module_core, - mod->module_core + mod->core_text_size, - set_memory_ro); - } - if ((mod->module_init) && (mod->init_text_size)) { - set_page_attributes(mod->module_init, - mod->module_init + mod->init_text_size, - set_memory_ro); - } + + frob_text(&mod->core_layout, set_memory_ro); + frob_text(&mod->init_layout, set_memory_ro); } mutex_unlock(&module_mutex); } + +static void disable_ro_nx(const struct module_layout *layout) +{ + frob_text(layout, set_memory_rw); + frob_rodata(layout, set_memory_rw); + frob_rodata(layout, set_memory_x); + frob_writable_data(layout, set_memory_x); +} + #else -static inline void set_section_ro_nx(void *base, unsigned long text_size, unsigned long ro_size, unsigned long total_size) { } -static void unset_module_core_ro_nx(struct module *mod) { } -static void unset_module_init_ro_nx(struct module *mod) { } +static void disable_ro_nx(const struct module_layout *layout) { } +static void module_enable_nx(const struct module *mod) { } +static void module_disable_nx(const struct module *mod) { } #endif void __weak module_memfree(void *module_region) @@ -2033,19 +2022,19 @@ static void free_module(struct module *mod) synchronize_sched(); mutex_unlock(&module_mutex); - /* This may be NULL, but that's OK */ - unset_module_init_ro_nx(mod); + /* This may be empty, but that's OK */ + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - module_memfree(mod->module_init); + module_memfree(mod->init_layout.base); kfree(mod->args); percpu_modfree(mod); /* Free lock-classes; relies on the preceding sync_rcu(). */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); /* Finally, free the core (containing the module structure) */ - unset_module_core_ro_nx(mod); - module_memfree(mod->module_core); + disable_ro_nx(&mod->core_layout); + module_memfree(mod->core_layout.base); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2248,20 +2237,20 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || strstarts(sname, ".init")) continue; - s->sh_entsize = get_offset(mod, &mod->core_size, s, i); + s->sh_entsize = get_offset(mod, &mod->core_layout.size, s, i); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->core_size = debug_align(mod->core_size); - mod->core_text_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.text_size = mod->core_layout.size; break; case 1: /* RO: text and ro-data */ - mod->core_size = debug_align(mod->core_size); - mod->core_ro_size = mod->core_size; + mod->core_layout.size = debug_align(mod->core_layout.size); + mod->core_layout.ro_size = mod->core_layout.size; break; case 3: /* whole core */ - mod->core_size = debug_align(mod->core_size); + mod->core_layout.size = debug_align(mod->core_layout.size); break; } } @@ -2277,21 +2266,21 @@ static void layout_sections(struct module *mod, struct load_info *info) || s->sh_entsize != ~0UL || !strstarts(sname, ".init")) continue; - s->sh_entsize = (get_offset(mod, &mod->init_size, s, i) + s->sh_entsize = (get_offset(mod, &mod->init_layout.size, s, i) | INIT_OFFSET_MASK); pr_debug("\t%s\n", sname); } switch (m) { case 0: /* executable */ - mod->init_size = debug_align(mod->init_size); - mod->init_text_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.text_size = mod->init_layout.size; break; case 1: /* RO: text and ro-data */ - mod->init_size = debug_align(mod->init_size); - mod->init_ro_size = mod->init_size; + mod->init_layout.size = debug_align(mod->init_layout.size); + mod->init_layout.ro_size = mod->init_layout.size; break; case 3: /* whole init */ - mod->init_size = debug_align(mod->init_size); + mod->init_layout.size = debug_align(mod->init_layout.size); break; } } @@ -2401,7 +2390,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } if (sym->st_shndx == SHN_UNDEF) return 'U'; - if (sym->st_shndx == SHN_ABS) + if (sym->st_shndx == SHN_ABS || sym->st_shndx == info->index.pcpu) return 'a'; if (sym->st_shndx >= SHN_LORESERVE) return '?'; @@ -2430,7 +2419,7 @@ static char elf_type(const Elf_Sym *sym, const struct load_info *info) } static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, - unsigned int shnum) + unsigned int shnum, unsigned int pcpundx) { const Elf_Shdr *sec; @@ -2439,6 +2428,11 @@ static bool is_core_symbol(const Elf_Sym *src, const Elf_Shdr *sechdrs, || !src->st_name) return false; +#ifdef CONFIG_KALLSYMS_ALL + if (src->st_shndx == pcpundx) + return true; +#endif + sec = sechdrs + src->st_shndx; if (!(sec->sh_flags & SHF_ALLOC) #ifndef CONFIG_KALLSYMS_ALL @@ -2466,7 +2460,7 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Put symbol section at end of init part of module. */ symsect->sh_flags |= SHF_ALLOC; - symsect->sh_entsize = get_offset(mod, &mod->init_size, symsect, + symsect->sh_entsize = get_offset(mod, &mod->init_layout.size, symsect, info->index.sym) | INIT_OFFSET_MASK; pr_debug("\t%s\n", info->secstrings + symsect->sh_name); @@ -2476,26 +2470,38 @@ static void layout_symtab(struct module *mod, struct load_info *info) /* Compute total space required for the core symbols' strtab. */ for (ndst = i = 0; i < nsrc; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } } /* Append room for core symbols at end of core part. */ - info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); - info->stroffs = mod->core_size = info->symoffs + ndst * sizeof(Elf_Sym); - mod->core_size += strtab_size; - mod->core_size = debug_align(mod->core_size); + info->symoffs = ALIGN(mod->core_layout.size, symsect->sh_addralign ?: 1); + info->stroffs = mod->core_layout.size = info->symoffs + ndst * sizeof(Elf_Sym); + mod->core_layout.size += strtab_size; + mod->core_layout.size = debug_align(mod->core_layout.size); /* Put string table section at end of init part of module. */ strsect->sh_flags |= SHF_ALLOC; - strsect->sh_entsize = get_offset(mod, &mod->init_size, strsect, + strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_size = debug_align(mod->init_size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2504,28 +2510,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->module_core + info->symoffs; - mod->core_strtab = s = mod->module_core + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || - is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, + info->index.pcpu)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -2665,7 +2677,7 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, if (info->len < sizeof(*(info->hdr))) return -ENOEXEC; - err = security_kernel_module_from_file(NULL); + err = security_kernel_read_file(NULL, READING_MODULE); if (err) return err; @@ -2683,63 +2695,6 @@ static int copy_module_from_user(const void __user *umod, unsigned long len, return 0; } -/* Sets info->hdr and info->len. */ -static int copy_module_from_fd(int fd, struct load_info *info) -{ - struct fd f = fdget(fd); - int err; - struct kstat stat; - loff_t pos; - ssize_t bytes = 0; - - if (!f.file) - return -ENOEXEC; - - err = security_kernel_module_from_file(f.file); - if (err) - goto out; - - err = vfs_getattr(&f.file->f_path, &stat); - if (err) - goto out; - - if (stat.size > INT_MAX) { - err = -EFBIG; - goto out; - } - - /* Don't hand 0 to vmalloc, it whines. */ - if (stat.size == 0) { - err = -EINVAL; - goto out; - } - - info->hdr = vmalloc(stat.size); - if (!info->hdr) { - err = -ENOMEM; - goto out; - } - - pos = 0; - while (pos < stat.size) { - bytes = kernel_read(f.file, pos, (char *)(info->hdr) + pos, - stat.size - pos); - if (bytes < 0) { - vfree(info->hdr); - err = bytes; - goto out; - } - if (bytes == 0) - break; - pos += bytes; - } - info->len = pos; - -out: - fdput(f); - return err; -} - static void free_copy(struct load_info *info) { vfree(info->hdr); @@ -2964,7 +2919,7 @@ static int move_module(struct module *mod, struct load_info *info) void *ptr; /* Do the allocs. */ - ptr = module_alloc(mod->core_size); + ptr = module_alloc(mod->core_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. Just mark it as not being a @@ -2974,11 +2929,11 @@ static int move_module(struct module *mod, struct load_info *info) if (!ptr) return -ENOMEM; - memset(ptr, 0, mod->core_size); - mod->module_core = ptr; + memset(ptr, 0, mod->core_layout.size); + mod->core_layout.base = ptr; - if (mod->init_size) { - ptr = module_alloc(mod->init_size); + if (mod->init_layout.size) { + ptr = module_alloc(mod->init_layout.size); /* * The pointer to this block is stored in the module structure * which is inside the block. This block doesn't need to be @@ -2987,13 +2942,13 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_memfree(mod->module_core); + module_memfree(mod->core_layout.base); return -ENOMEM; } - memset(ptr, 0, mod->init_size); - mod->module_init = ptr; + memset(ptr, 0, mod->init_layout.size); + mod->init_layout.base = ptr; } else - mod->module_init = NULL; + mod->init_layout.base = NULL; /* Transfer each section which specifies SHF_ALLOC */ pr_debug("final section addresses:\n"); @@ -3005,10 +2960,10 @@ static int move_module(struct module *mod, struct load_info *info) continue; if (shdr->sh_entsize & INIT_OFFSET_MASK) - dest = mod->module_init + dest = mod->init_layout.base + (shdr->sh_entsize & ~INIT_OFFSET_MASK); else - dest = mod->module_core + shdr->sh_entsize; + dest = mod->core_layout.base + shdr->sh_entsize; if (shdr->sh_type != SHT_NOBITS) memcpy(dest, (void *)shdr->sh_addr, shdr->sh_size); @@ -3070,12 +3025,12 @@ static void flush_module_icache(const struct module *mod) * Do it before processing of module parameters, so the module * can provide parameter accessor functions of its own. */ - if (mod->module_init) - flush_icache_range((unsigned long)mod->module_init, - (unsigned long)mod->module_init - + mod->init_size); - flush_icache_range((unsigned long)mod->module_core, - (unsigned long)mod->module_core + mod->core_size); + if (mod->init_layout.base) + flush_icache_range((unsigned long)mod->init_layout.base, + (unsigned long)mod->init_layout.base + + mod->init_layout.size); + flush_icache_range((unsigned long)mod->core_layout.base, + (unsigned long)mod->core_layout.base + mod->core_layout.size); set_fs(old_fs); } @@ -3133,8 +3088,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); - module_memfree(mod->module_core); + module_memfree(mod->init_layout.base); + module_memfree(mod->core_layout.base); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3221,7 +3176,7 @@ static noinline int do_init_module(struct module *mod) ret = -ENOMEM; goto fail; } - freeinit->module_init = mod->module_init; + freeinit->module_init = mod->init_layout.base; /* * We want to find out whether @mod uses async during init. Clear @@ -3274,17 +3229,16 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); - unset_module_init_ro_nx(mod); + disable_ro_nx(&mod->init_layout); module_arch_freeing_init(mod); - mod->module_init = NULL; - mod->init_size = 0; - mod->init_ro_size = 0; - mod->init_text_size = 0; + mod->init_layout.base = NULL; + mod->init_layout.size = 0; + mod->init_layout.ro_size = 0; + mod->init_layout.text_size = 0; /* * We want to free module_init, but be aware that kallsyms may be * walking this with preempt disabled. In all the failure paths, we @@ -3306,6 +3260,8 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + klp_module_going(mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3373,25 +3329,15 @@ static int complete_formation(struct module *mod, struct load_info *info) /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - /* Set RO and NX regions for core */ - set_section_ro_nx(mod->module_core, - mod->core_text_size, - mod->core_ro_size, - mod->core_size); - - /* Set RO and NX regions for init */ - set_section_ro_nx(mod->module_init, - mod->init_text_size, - mod->init_ro_size, - mod->init_size); + /* Set RO and NX regions */ + module_enable_ro(mod); + module_enable_nx(mod); /* Mark state as coming so strong_try_module_get() ignores us, * but kallsyms etc. can see us. */ mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_COMING, mod); return 0; out: @@ -3399,6 +3345,20 @@ out: return err; } +static int prepare_coming_module(struct module *mod) +{ + int err; + + ftrace_module_enable(mod); + err = klp_module_coming(mod); + if (err) + return err; + + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_COMING, mod); + return 0; +} + static int unknown_module_param_cb(char *param, char *val, const char *modname, void *arg) { @@ -3513,13 +3473,17 @@ static int load_module(struct load_info *info, const char __user *uargs, if (err) goto ddebug_cleanup; + err = prepare_coming_module(mod); + if (err) + goto bug_cleanup; + /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); - goto bug_cleanup; + goto coming_cleanup; } else if (after_dashes) { pr_warn("%s: parameters '%s' after `--' ignored\n", mod->name, after_dashes); @@ -3528,7 +3492,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto bug_cleanup; + goto coming_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3538,18 +3502,20 @@ static int load_module(struct load_info *info, const char __user *uargs, return do_init_module(mod); + coming_cleanup: + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + klp_module_going(mod); + bug_cleanup: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); mutex_unlock(&module_mutex); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - /* we can't deallocate the module until we clear memory protection */ - unset_module_init_ro_nx(mod); - unset_module_core_ro_nx(mod); + module_disable_ro(mod); + module_disable_nx(mod); ddebug_cleanup: dynamic_debug_remove(info->debug); @@ -3571,8 +3537,14 @@ static int load_module(struct load_info *info, const char __user *uargs, synchronize_sched(); mutex_unlock(&module_mutex); free_module: + /* + * Ftrace needs to clean up what it initialized. + * This does nothing if ftrace_module_init() wasn't called, + * but it must be called outside of module_mutex. + */ + ftrace_release_mod(mod); /* Free lock-classes; relies on the preceding sync_rcu() */ - lockdep_free_key_range(mod->module_core, mod->core_size); + lockdep_free_key_range(mod->core_layout.base, mod->core_layout.size); module_deallocate(mod, info); free_copy: @@ -3602,8 +3574,10 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) { - int err; struct load_info info = { }; + loff_t size; + void *hdr; + int err; err = may_init_module(); if (err) @@ -3615,9 +3589,12 @@ SYSCALL_DEFINE3(finit_module, int, fd, const char __user *, uargs, int, flags) |MODULE_INIT_IGNORE_VERMAGIC)) return -EINVAL; - err = copy_module_from_fd(fd, &info); + err = kernel_read_file_from_fd(fd, &hdr, &size, INT_MAX, + READING_MODULE); if (err) return err; + info.hdr = hdr; + info.len = size; return load_module(&info, uargs, flags); } @@ -3640,6 +3617,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3647,41 +3629,42 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) - nextval = (unsigned long)mod->module_init+mod->init_text_size; + nextval = (unsigned long)mod->init_layout.base+mod->init_layout.text_size; else - nextval = (unsigned long)mod->module_core+mod->core_text_size; + nextval = (unsigned long)mod->core_layout.base+mod->core_layout.text_size; /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3771,19 +3754,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3792,11 +3777,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3835,11 +3821,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } @@ -3899,7 +3888,7 @@ static int m_show(struct seq_file *m, void *p) return 0; seq_printf(m, "%s %u", - mod->name, mod->init_size + mod->core_size); + mod->name, mod->init_layout.size + mod->core_layout.size); print_unload_info(m, mod); /* Informative for users. */ @@ -3908,7 +3897,7 @@ static int m_show(struct seq_file *m, void *p) mod->state == MODULE_STATE_COMING ? "Loading" : "Live"); /* Used by oprofile and other similar tools. */ - seq_printf(m, " 0x%pK", mod->module_core); + seq_printf(m, " 0x%pK", mod->core_layout.base); /* Taints info */ if (mod->taints) @@ -4051,8 +4040,8 @@ struct module *__module_text_address(unsigned long addr) struct module *mod = __module_address(addr); if (mod) { /* Make sure it's within the text section. */ - if (!within(addr, mod->module_init, mod->init_text_size) - && !within(addr, mod->module_core, mod->core_text_size)) + if (!within(addr, mod->init_layout.base, mod->init_layout.text_size) + && !within(addr, mod->core_layout.base, mod->core_layout.text_size)) mod = NULL; } return mod; diff --git a/kernel/module_signing.c b/kernel/module_signing.c index 6528a79d998d..64b9dead4a07 100644 --- a/kernel/module_signing.c +++ b/kernel/module_signing.c @@ -11,10 +11,17 @@ #include <linux/kernel.h> #include <linux/errno.h> +#include <linux/string.h> #include <keys/system_keyring.h> #include <crypto/public_key.h> #include "module-internal.h" +enum pkey_id_type { + PKEY_ID_PGP, /* OpenPGP generated key ID */ + PKEY_ID_X509, /* X.509 arbitrary subjectKeyIdentifier */ + PKEY_ID_PKCS7, /* Signature in PKCS#7 message */ +}; + /* * Module signature information block. * diff --git a/kernel/nsproxy.c b/kernel/nsproxy.c index 49746c81ad8d..782102e59eed 100644 --- a/kernel/nsproxy.c +++ b/kernel/nsproxy.c @@ -25,6 +25,7 @@ #include <linux/proc_ns.h> #include <linux/file.h> #include <linux/syscalls.h> +#include <linux/cgroup.h> static struct kmem_cache *nsproxy_cachep; @@ -39,6 +40,9 @@ struct nsproxy init_nsproxy = { #ifdef CONFIG_NET .net_ns = &init_net, #endif +#ifdef CONFIG_CGROUPS + .cgroup_ns = &init_cgroup_ns, +#endif }; static inline struct nsproxy *create_nsproxy(void) @@ -92,6 +96,13 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, goto out_pid; } + new_nsp->cgroup_ns = copy_cgroup_ns(flags, user_ns, + tsk->nsproxy->cgroup_ns); + if (IS_ERR(new_nsp->cgroup_ns)) { + err = PTR_ERR(new_nsp->cgroup_ns); + goto out_cgroup; + } + new_nsp->net_ns = copy_net_ns(flags, user_ns, tsk->nsproxy->net_ns); if (IS_ERR(new_nsp->net_ns)) { err = PTR_ERR(new_nsp->net_ns); @@ -101,6 +112,8 @@ static struct nsproxy *create_new_namespaces(unsigned long flags, return new_nsp; out_net: + put_cgroup_ns(new_nsp->cgroup_ns); +out_cgroup: if (new_nsp->pid_ns_for_children) put_pid_ns(new_nsp->pid_ns_for_children); out_pid: @@ -128,7 +141,8 @@ int copy_namespaces(unsigned long flags, struct task_struct *tsk) struct nsproxy *new_ns; if (likely(!(flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWPID | CLONE_NEWNET)))) { + CLONE_NEWPID | CLONE_NEWNET | + CLONE_NEWCGROUP)))) { get_nsproxy(old_ns); return 0; } @@ -165,6 +179,7 @@ void free_nsproxy(struct nsproxy *ns) put_ipc_ns(ns->ipc_ns); if (ns->pid_ns_for_children) put_pid_ns(ns->pid_ns_for_children); + put_cgroup_ns(ns->cgroup_ns); put_net(ns->net_ns); kmem_cache_free(nsproxy_cachep, ns); } @@ -180,7 +195,7 @@ int unshare_nsproxy_namespaces(unsigned long unshare_flags, int err = 0; if (!(unshare_flags & (CLONE_NEWNS | CLONE_NEWUTS | CLONE_NEWIPC | - CLONE_NEWNET | CLONE_NEWPID))) + CLONE_NEWNET | CLONE_NEWPID | CLONE_NEWCGROUP))) return 0; user_ns = new_cred ? new_cred->user_ns : current_user_ns(); diff --git a/kernel/panic.c b/kernel/panic.c index 04e91ff7560b..535c96510a44 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -23,6 +23,8 @@ #include <linux/sysrq.h> #include <linux/init.h> #include <linux/nmi.h> +#include <linux/console.h> +#include <linux/bug.h> #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 @@ -60,6 +62,37 @@ void __weak panic_smp_self_stop(void) cpu_relax(); } +/* + * Stop ourselves in NMI context if another CPU has already panicked. Arch code + * may override this to prepare for crash dumping, e.g. save regs info. + */ +void __weak nmi_panic_self_stop(struct pt_regs *regs) +{ + panic_smp_self_stop(); +} + +atomic_t panic_cpu = ATOMIC_INIT(PANIC_CPU_INVALID); + +/* + * A variant of panic() called from NMI context. We return if we've already + * panicked on this CPU. If another CPU already panicked, loop in + * nmi_panic_self_stop() which can provide architecture dependent code such + * as saving register state for crash dump. + */ +void nmi_panic(struct pt_regs *regs, const char *msg) +{ + int old_cpu, cpu; + + cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, cpu); + + if (old_cpu == PANIC_CPU_INVALID) + panic("%s", msg); + else if (old_cpu != cpu) + nmi_panic_self_stop(regs); +} +EXPORT_SYMBOL(nmi_panic); + /** * panic - halt the system * @fmt: The text string to print @@ -70,17 +103,17 @@ void __weak panic_smp_self_stop(void) */ void panic(const char *fmt, ...) { - static DEFINE_SPINLOCK(panic_lock); static char buf[1024]; va_list args; long i, i_next = 0; int state = 0; + int old_cpu, this_cpu; /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since * there is nothing to prevent an interrupt handler (that runs - * after the panic_lock is acquired) from invoking panic again. + * after setting panic_cpu) from invoking panic() again. */ local_irq_disable(); @@ -93,8 +126,16 @@ void panic(const char *fmt, ...) * multiple parallel invocations of panic, all other CPUs either * stop themself or will wait until they are stopped by the 1st CPU * with smp_send_stop(). + * + * `old_cpu == PANIC_CPU_INVALID' means this is the 1st CPU which + * comes here, so go ahead. + * `old_cpu == this_cpu' means we came from nmi_panic() which sets + * panic_cpu to this CPU. In this case, this is also the 1st CPU. */ - if (!spin_trylock(&panic_lock)) + this_cpu = raw_smp_processor_id(); + old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); + + if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); console_verbose(); @@ -116,9 +157,11 @@ void panic(const char *fmt, ...) * everything else. * If we want to run this after calling panic_notifiers, pass * the "crash_kexec_post_notifiers" option to the kernel. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); /* * Note smp_send_stop is the usual smp shutdown function, which @@ -141,12 +184,25 @@ void panic(const char *fmt, ...) * panic_notifiers and dumping kmsg before kdump. * Note: since some panic_notifiers can make crashed kernel * more unstable, it can increase risks of the kdump failure too. + * + * Bypass the panic_cpu check and call __crash_kexec directly. */ if (crash_kexec_post_notifiers) - crash_kexec(NULL); + __crash_kexec(NULL); bust_spinlocks(0); + /* + * We may have ended up stopping the CPU holding the lock (in + * smp_send_stop()) while still having some valuable data in the console + * buffer. Try to acquire the lock then release it regardless of the + * result. The release will also print the buffers out. Locks debug + * should be disabled to avoid reporting bad unlock balance when + * panic() is not being callled from OOPS. + */ + debug_locks_off(); + console_flush_on_panic(); + if (!panic_blink) panic_blink = no_blink; @@ -414,20 +470,25 @@ void oops_exit(void) kmsg_dump(KMSG_DUMP_OOPS); } -#ifdef WANT_WARN_ON_SLOWPATH -struct slowpath_args { +struct warn_args { const char *fmt; va_list args; }; -static void warn_slowpath_common(const char *file, int line, void *caller, - unsigned taint, struct slowpath_args *args) +void __warn(const char *file, int line, void *caller, unsigned taint, + struct pt_regs *regs, struct warn_args *args) { disable_trace_on_warning(); pr_warn("------------[ cut here ]------------\n"); - pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS()\n", - raw_smp_processor_id(), current->pid, file, line, caller); + + if (file) + pr_warn("WARNING: CPU: %d PID: %d at %s:%d %pS\n", + raw_smp_processor_id(), current->pid, file, line, + caller); + else + pr_warn("WARNING: CPU: %d PID: %d at %pS\n", + raw_smp_processor_id(), current->pid, caller); if (args) vprintk(args->fmt, args->args); @@ -444,20 +505,27 @@ static void warn_slowpath_common(const char *file, int line, void *caller, } print_modules(); - dump_stack(); + + if (regs) + show_regs(regs); + else + dump_stack(); + print_oops_end_marker(); + /* Just a warning, don't kill lockdep. */ add_taint(taint, LOCKDEP_STILL_OK); } +#ifdef WANT_WARN_ON_SLOWPATH void warn_slowpath_fmt(const char *file, int line, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, &args); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, + &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt); @@ -465,20 +533,18 @@ EXPORT_SYMBOL(warn_slowpath_fmt); void warn_slowpath_fmt_taint(const char *file, int line, unsigned taint, const char *fmt, ...) { - struct slowpath_args args; + struct warn_args args; args.fmt = fmt; va_start(args.args, fmt); - warn_slowpath_common(file, line, __builtin_return_address(0), - taint, &args); + __warn(file, line, __builtin_return_address(0), taint, NULL, &args); va_end(args.args); } EXPORT_SYMBOL(warn_slowpath_fmt_taint); void warn_slowpath_null(const char *file, int line) { - warn_slowpath_common(file, line, __builtin_return_address(0), - TAINT_WARN, NULL); + __warn(file, line, __builtin_return_address(0), TAINT_WARN, NULL, NULL); } EXPORT_SYMBOL(warn_slowpath_null); #endif diff --git a/kernel/params.c b/kernel/params.c index b6554aa71094..a6d6149c0fe6 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -223,7 +223,7 @@ char *parse_args(const char *doing, int (*unknown)(char *param, char *val, const char *doing, void *arg)) { - char *param, *val; + char *param, *val, *err = NULL; /* Chew leading spaces */ args = skip_spaces(args); @@ -238,7 +238,7 @@ char *parse_args(const char *doing, args = next_arg(args, ¶m, &val); /* Stop at -- */ if (!val && strcmp(param, "--") == 0) - return args; + return err ?: args; irq_was_disabled = irqs_disabled(); ret = parse_one(param, val, doing, params, num, min_level, max_level, arg, unknown); @@ -247,24 +247,25 @@ char *parse_args(const char *doing, doing, param); switch (ret) { + case 0: + continue; case -ENOENT: pr_err("%s: Unknown parameter `%s'\n", doing, param); - return ERR_PTR(ret); + break; case -ENOSPC: pr_err("%s: `%s' too large for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); - case 0: break; default: pr_err("%s: `%s' invalid for parameter `%s'\n", doing, val ?: "", param); - return ERR_PTR(ret); + break; } + + err = ERR_PTR(ret); } - /* All parsed OK. */ - return NULL; + return err; } /* Lazy bastard, eh? */ @@ -325,10 +326,11 @@ int param_get_charp(char *buffer, const struct kernel_param *kp) } EXPORT_SYMBOL(param_get_charp); -static void param_free_charp(void *arg) +void param_free_charp(void *arg) { maybe_kfree_parameter(*((char **)arg)); } +EXPORT_SYMBOL(param_free_charp); const struct kernel_param_ops param_ops_charp = { .set = param_set_charp, diff --git a/kernel/pid.c b/kernel/pid.c index ca368793808e..4d73a834c7e6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -467,7 +467,7 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type) rcu_read_lock(); if (type != PIDTYPE_PID) task = task->group_leader; - pid = get_pid(task->pids[type].pid); + pid = get_pid(rcu_dereference(task->pids[type].pid)); rcu_read_unlock(); return pid; } @@ -528,7 +528,7 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (likely(pid_alive(task))) { if (type != PIDTYPE_PID) task = task->group_leader; - nr = pid_nr_ns(task->pids[type].pid, ns); + nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ @@ -604,5 +604,5 @@ void __init pidmap_init(void) atomic_dec(&init_pid_ns.pidmap[0].nr_free); init_pid_ns.pid_cachep = KMEM_CACHE(pid, - SLAB_HWCACHE_ALIGN | SLAB_PANIC); + SLAB_HWCACHE_ALIGN | SLAB_PANIC | SLAB_ACCOUNT); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 02e8dfaa1ce2..68d3ebc12601 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -235,7 +235,7 @@ config PM_TRACE_RTC config APM_EMULATION tristate "Advanced Power Management Emulation" - depends on PM && SYS_SUPPORTS_APM_EMULATION + depends on SYS_SUPPORTS_APM_EMULATION help APM is a BIOS specification for saving power using several different techniques. This is mostly useful for battery powered laptops with diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..fca9254280ee 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; @@ -1158,6 +1159,22 @@ static int __init kaslr_nohibernate_setup(char *str) return nohibernate_setup(str); } +static int __init page_poison_nohibernate_setup(char *str) +{ +#ifdef CONFIG_PAGE_POISONING_ZERO + /* + * The zeroing option for page poison skips the checks on alloc. + * since hibernation doesn't save free pages there's no way to + * guarantee the pages will still be zeroed. + */ + if (!strcmp(str, "on")) { + pr_info("Disabling hibernation due to page poisoning\n"); + return nohibernate_setup(str); + } +#endif + return 1; +} + __setup("noresume", noresume_setup); __setup("resume_offset=", resume_offset_setup); __setup("resume=", resume_setup); @@ -1166,3 +1183,4 @@ __setup("resumewait", resumewait_setup); __setup("resumedelay=", resumedelay_setup); __setup("nohibernate", nohibernate_setup); __setup("kaslr", kaslr_nohibernate_setup); +__setup("page_poison=", page_poison_nohibernate_setup); diff --git a/kernel/power/main.c b/kernel/power/main.c index b2dd4d999900..27946975eff0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -280,13 +280,7 @@ static ssize_t pm_wakeup_irq_show(struct kobject *kobj, return pm_wakeup_irq ? sprintf(buf, "%u\n", pm_wakeup_irq) : -ENODATA; } -static ssize_t pm_wakeup_irq_store(struct kobject *kobj, - struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} -power_attr(pm_wakeup_irq); +power_attr_ro(pm_wakeup_irq); #else /* !CONFIG_PM_SLEEP_DEBUG */ static inline void pm_print_times_init(void) {} @@ -564,14 +558,7 @@ static ssize_t pm_trace_dev_match_show(struct kobject *kobj, return show_trace_dev_match(buf, PAGE_SIZE); } -static ssize_t -pm_trace_dev_match_store(struct kobject *kobj, struct kobj_attribute *attr, - const char *buf, size_t n) -{ - return -EINVAL; -} - -power_attr(pm_trace_dev_match); +power_attr_ro(pm_trace_dev_match); #endif /* CONFIG_PM_TRACE */ diff --git a/kernel/power/power.h b/kernel/power/power.h index caadb566e82b..efe1b3b17c88 100644 --- a/kernel/power/power.h +++ b/kernel/power/power.h @@ -77,6 +77,15 @@ static struct kobj_attribute _name##_attr = { \ .store = _name##_store, \ } +#define power_attr_ro(_name) \ +static struct kobj_attribute _name##_attr = { \ + .attr = { \ + .name = __stringify(_name), \ + .mode = S_IRUGO, \ + }, \ + .show = _name##_show, \ +} + /* Preferred image size in bytes (default 500 MB) */ extern unsigned long image_size; /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..df058bed53ce 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -30,13 +30,12 @@ static int try_to_freeze_tasks(bool user_only) unsigned long end_time; unsigned int todo; bool wq_busy = false; - struct timeval start, end; - u64 elapsed_msecs64; + ktime_t start, end, elapsed; unsigned int elapsed_msecs; bool wakeup = false; int sleep_usecs = USEC_PER_MSEC; - do_gettimeofday(&start); + start = ktime_get_boottime(); end_time = jiffies + msecs_to_jiffies(freeze_timeout_msecs); @@ -78,10 +77,9 @@ static int try_to_freeze_tasks(bool user_only) sleep_usecs *= 2; } - do_gettimeofday(&end); - elapsed_msecs64 = timeval_to_ns(&end) - timeval_to_ns(&start); - do_div(elapsed_msecs64, NSEC_PER_MSEC); - elapsed_msecs = elapsed_msecs64; + end = ktime_get_boottime(); + elapsed = ktime_sub(end, start); + elapsed_msecs = ktime_to_ms(elapsed); if (todo) { pr_cont("\n"); diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5235dd4e1e2f..3a970604308f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1779,7 +1779,7 @@ alloc_highmem_pages(struct memory_bitmap *bm, unsigned int nr_highmem) while (to_alloc-- > 0) { struct page *page; - page = alloc_image_page(__GFP_HIGHMEM); + page = alloc_image_page(__GFP_HIGHMEM|__GFP_KSWAPD_RECLAIM); memory_bm_set_bit(bm, page_to_pfn(page)); } return nr_highmem; diff --git a/kernel/power/suspend.c b/kernel/power/suspend.c index f9fe133c13e2..5b70d64b871e 100644 --- a/kernel/power/suspend.c +++ b/kernel/power/suspend.c @@ -248,7 +248,7 @@ static int suspend_test(int level) { #ifdef CONFIG_PM_DEBUG if (pm_test_level == level) { - printk(KERN_INFO "suspend debug: Waiting for %d second(s).\n", + pr_info("suspend debug: Waiting for %d second(s).\n", pm_test_delay); mdelay(pm_test_delay * 1000); return 1; @@ -320,7 +320,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_late(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: late suspend of devices failed\n"); + pr_err("PM: late suspend of devices failed\n"); goto Platform_finish; } error = platform_suspend_prepare_late(state); @@ -329,7 +329,7 @@ static int suspend_enter(suspend_state_t state, bool *wakeup) error = dpm_suspend_noirq(PMSG_SUSPEND); if (error) { - printk(KERN_ERR "PM: noirq suspend of devices failed\n"); + pr_err("PM: noirq suspend of devices failed\n"); goto Platform_early_resume; } error = platform_suspend_prepare_noirq(state); @@ -473,8 +473,7 @@ static int enter_state(suspend_state_t state) if (state == PM_SUSPEND_FREEZE) { #ifdef CONFIG_PM_DEBUG if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) { - pr_warning("PM: Unsupported test mode for suspend to idle," - "please choose none/freezer/devices/platform.\n"); + pr_warn("PM: Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n"); return -EAGAIN; } #endif diff --git a/kernel/power/swap.c b/kernel/power/swap.c index b2066fb5b10f..12cd989dadf6 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -257,7 +257,7 @@ static int hib_submit_io(int rw, pgoff_t page_off, void *addr, struct bio *bio; int error = 0; - bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1); + bio = bio_alloc(__GFP_RECLAIM | __GFP_HIGH, 1); bio->bi_iter.bi_sector = page_off * (PAGE_SIZE >> 9); bio->bi_bdev = hib_resume_bdev; @@ -356,7 +356,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) return -ENOSPC; if (hb) { - src = (void *)__get_free_page(__GFP_WAIT | __GFP_NOWARN | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { copy_page(src, buf); @@ -364,7 +364,7 @@ static int write_page(void *buf, sector_t offset, struct hib_bio_batch *hb) ret = hib_wait_io(hb); /* Free pages */ if (ret) return ret; - src = (void *)__get_free_page(__GFP_WAIT | + src = (void *)__get_free_page(__GFP_RECLAIM | __GFP_NOWARN | __GFP_NORETRY); if (src) { @@ -672,7 +672,7 @@ static int save_image_lzo(struct swap_map_handle *handle, nr_threads = num_online_cpus() - 1; nr_threads = clamp_val(nr_threads, 1, LZO_THREADS); - page = (void *)__get_free_page(__GFP_WAIT | __GFP_HIGH); + page = (void *)__get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!page) { printk(KERN_ERR "PM: Failed to allocate LZO page\n"); ret = -ENOMEM; @@ -975,7 +975,7 @@ static int get_swap_reader(struct swap_map_handle *handle, last = tmp; tmp->map = (struct swap_map_page *) - __get_free_page(__GFP_WAIT | __GFP_HIGH); + __get_free_page(__GFP_RECLAIM | __GFP_HIGH); if (!tmp->map) { release_swap_reader(handle); return -ENOMEM; @@ -1242,9 +1242,9 @@ static int load_image_lzo(struct swap_map_handle *handle, for (i = 0; i < read_pages; i++) { page[i] = (void *)__get_free_page(i < LZO_CMP_PAGES ? - __GFP_WAIT | __GFP_HIGH : - __GFP_WAIT | __GFP_NOWARN | - __GFP_NORETRY); + __GFP_RECLAIM | __GFP_HIGH : + __GFP_RECLAIM | __GFP_NOWARN | + __GFP_NORETRY); if (!page[i]) { if (i < LZO_CMP_PAGES) { diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index b16f35487b67..bfbf284e4218 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -48,6 +48,7 @@ #include <linux/uio.h> #include <asm/uaccess.h> +#include <asm-generic/sections.h> #define CREATE_TRACE_POINTS #include <trace/events/printk.h> @@ -232,7 +233,11 @@ struct printk_log { u8 facility; /* syslog facility */ u8 flags:5; /* internal record flags */ u8 level:3; /* syslog level */ -}; +} +#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS +__packed __aligned(4) +#endif +; /* * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken @@ -269,12 +274,11 @@ static u32 clear_idx; #define PREFIX_MAX 32 #define LOG_LINE_MAX (1024 - PREFIX_MAX) +#define LOG_LEVEL(v) ((v) & 0x07) +#define LOG_FACILITY(v) ((v) >> 3 & 0xff) + /* record buffer */ -#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) -#define LOG_ALIGN 4 -#else #define LOG_ALIGN __alignof__(struct printk_log) -#endif #define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) static char __log_buf[__LOG_BUF_LEN] __aligned(LOG_ALIGN); static char *log_buf = __log_buf; @@ -363,16 +367,20 @@ static int logbuf_has_space(u32 msg_size, bool empty) static int log_make_free_space(u32 msg_size) { - while (log_first_seq < log_next_seq) { - if (logbuf_has_space(msg_size, false)) - return 0; + while (log_first_seq < log_next_seq && + !logbuf_has_space(msg_size, false)) { /* drop old messages until we have enough contiguous space */ log_first_idx = log_next(log_first_idx); log_first_seq++; } + if (clear_seq < log_first_seq) { + clear_seq = log_first_seq; + clear_idx = log_first_idx; + } + /* sequence numbers are equal, so the log buffer is empty */ - if (logbuf_has_space(msg_size, true)) + if (logbuf_has_space(msg_size, log_first_seq == log_next_seq)) return 0; return -ENOMEM; @@ -612,7 +620,6 @@ struct devkmsg_user { static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) { char *buf, *line; - int i; int level = default_message_loglevel; int facility = 1; /* LOG_USER */ size_t len = iov_iter_count(from); @@ -642,12 +649,13 @@ static ssize_t devkmsg_write(struct kiocb *iocb, struct iov_iter *from) line = buf; if (line[0] == '<') { char *endp = NULL; + unsigned int u; - i = simple_strtoul(line+1, &endp, 10); + u = simple_strtoul(line + 1, &endp, 10); if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; + level = LOG_LEVEL(u); + if (LOG_FACILITY(u) != 0) + facility = LOG_FACILITY(u); endp++; len -= endp - line; line = endp; @@ -850,6 +858,7 @@ void log_buf_kexec_setup(void) VMCOREINFO_SYMBOL(log_buf); VMCOREINFO_SYMBOL(log_buf_len); VMCOREINFO_SYMBOL(log_first_idx); + VMCOREINFO_SYMBOL(clear_idx); VMCOREINFO_SYMBOL(log_next_idx); /* * Export struct printk_log size and field offsets. User space tools can @@ -1212,12 +1221,6 @@ static int syslog_print_all(char __user *buf, int size, bool clear) u32 idx; enum log_flags prev; - if (clear_seq < log_first_seq) { - /* messages are gone, move to first available one */ - clear_seq = log_first_seq; - clear_idx = log_first_idx; - } - /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. @@ -1479,58 +1482,6 @@ static void zap_locks(void) sema_init(&console_sem, 1); } -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if (con->flags & CON_ANYTIME) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(unsigned int cpu) -{ - return cpu_online(cpu) || have_callable_console(); -} - -/* - * Try to get console ownership to actually show the kernel - * messages from a 'printk'. Return true (and with the - * console_lock held, and 'console_locked' set) if it - * is successful, false otherwise. - */ -static int console_trylock_for_printk(void) -{ - unsigned int cpu = smp_processor_id(); - - if (!console_trylock()) - return 0; - /* - * If we can't use the console, we need to release the console - * semaphore by hand to avoid flushing the buffer. We need to hold the - * console semaphore in order to do this test safely. - */ - if (!can_use_console(cpu)) { - console_locked = 0; - up_console_sem(); - return 0; - } - return 1; -} - int printk_delay_msec __read_mostly; static inline void printk_delay(void) @@ -1657,7 +1608,7 @@ asmlinkage int vprintk_emit(int facility, int level, const char *dict, size_t dictlen, const char *fmt, va_list args) { - static int recursion_bug; + static bool recursion_bug; static char textbuf[LOG_LINE_MAX]; char *text = textbuf; size_t text_len = 0; @@ -1677,7 +1628,6 @@ asmlinkage int vprintk_emit(int facility, int level, boot_delay_msec(level); printk_delay(); - /* This stops the holder of console_sem just where we want him */ local_irq_save(flags); this_cpu = smp_processor_id(); @@ -1693,7 +1643,7 @@ asmlinkage int vprintk_emit(int facility, int level, * it can be printed at the next appropriate moment: */ if (!oops_in_progress && !lockdep_recursing(current)) { - recursion_bug = 1; + recursion_bug = true; local_irq_restore(flags); return 0; } @@ -1701,6 +1651,7 @@ asmlinkage int vprintk_emit(int facility, int level, } lockdep_off(); + /* This stops the holder of console_sem just where we want him */ raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; @@ -1708,7 +1659,7 @@ asmlinkage int vprintk_emit(int facility, int level, static const char recursion_msg[] = "BUG: recent printk recursion!"; - recursion_bug = 0; + recursion_bug = false; /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, NULL, 0, recursion_msg, @@ -1806,20 +1757,12 @@ asmlinkage int vprintk_emit(int facility, int level, if (!in_sched) { lockdep_off(); /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - - /* * Try to acquire and then immediately release the console * semaphore. The release will print out buffers and wake up * /dev/kmsg and syslog() users. */ - if (console_trylock_for_printk()) + if (console_trylock()) console_unlock(); - preempt_enable(); lockdep_on(); } @@ -2170,7 +2113,20 @@ int console_trylock(void) return 0; } console_locked = 1; - console_may_schedule = 0; + /* + * When PREEMPT_COUNT disabled we can't reliably detect if it's + * safe to schedule (e.g. calling printk while holding a spin_lock), + * because preempt_disable()/preempt_enable() are just barriers there + * and preempt_count() is always 0. + * + * RCU read sections have a separate preemption counter when + * PREEMPT_RCU enabled thus we must take extra care and check + * rcu_preempt_depth(), otherwise RCU read sections modify + * preempt_count(). + */ + console_may_schedule = !oops_in_progress && + preemptible() && + !rcu_preempt_depth(); return 1; } EXPORT_SYMBOL(console_trylock); @@ -2180,6 +2136,34 @@ int is_console_locked(void) return console_locked; } +/* + * Check if we have any console that is capable of printing while cpu is + * booting or shutting down. Requires console_sem. + */ +static int have_callable_console(void) +{ + struct console *con; + + for_each_console(con) + if ((con->flags & CON_ENABLED) && + (con->flags & CON_ANYTIME)) + return 1; + + return 0; +} + +/* + * Can we actually use the console at this time on this cpu? + * + * Console drivers may assume that per-cpu resources have been allocated. So + * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't + * call them until this CPU is officially up. + */ +static inline int can_use_console(void) +{ + return cpu_online(raw_smp_processor_id()) || have_callable_console(); +} + static void console_cont_flush(char *text, size_t size) { unsigned long flags; @@ -2230,18 +2214,41 @@ void console_unlock(void) static u64 seen_seq; unsigned long flags; bool wake_klogd = false; - bool retry; + bool do_cond_resched, retry; if (console_suspended) { up_console_sem(); return; } + /* + * Console drivers are called under logbuf_lock, so + * @console_may_schedule should be cleared before; however, we may + * end up dumping a lot of lines, for example, if called from + * console registration path, and should invoke cond_resched() + * between lines if allowable. Not doing so can cause a very long + * scheduling stall on a slow console leading to RCU stall and + * softlockup warnings which exacerbate the issue with more + * messages practically incapacitating the system. + */ + do_cond_resched = console_may_schedule; console_may_schedule = 0; +again: + /* + * We released the console_sem lock, so we need to recheck if + * cpu is online and (if not) is there at least one CON_ANYTIME + * console. + */ + if (!can_use_console()) { + console_locked = 0; + up_console_sem(); + return; + } + /* flush buffered message fragment immediately to console */ console_cont_flush(text, sizeof(text)); -again: + for (;;) { struct printk_log *msg; size_t ext_len = 0; @@ -2308,6 +2315,9 @@ skip: call_console_drivers(level, ext_text, ext_len, text, len); start_critical_timings(); local_irq_restore(flags); + + if (do_cond_resched) + cond_resched(); } console_locked = 0; @@ -2375,6 +2385,25 @@ void console_unblank(void) console_unlock(); } +/** + * console_flush_on_panic - flush console content on panic + * + * Immediately output all pending messages no matter what. + */ +void console_flush_on_panic(void) +{ + /* + * If someone else is holding the console lock, trylock will fail + * and may_schedule may be set. Ignore and proceed to unlock so + * that messages are flushed out. As this can be called from any + * context and we don't want to get preempted while flushing, + * ensure may_schedule is cleared. + */ + console_trylock(); + console_may_schedule = 0; + console_unlock(); +} + /* * Return the console tty driver structure and its associated index */ @@ -2655,13 +2684,36 @@ int unregister_console(struct console *console) } EXPORT_SYMBOL(unregister_console); +/* + * Some boot consoles access data that is in the init section and which will + * be discarded after the initcalls have been run. To make sure that no code + * will access this data, unregister the boot consoles in a late initcall. + * + * If for some reason, such as deferred probe or the driver being a loadable + * module, the real console hasn't registered yet at this point, there will + * be a brief interval in which no messages are logged to the console, which + * makes it difficult to diagnose problems that occur during this time. + * + * To mitigate this problem somewhat, only unregister consoles whose memory + * intersects with the init section. Note that code exists elsewhere to get + * rid of the boot console as soon as the proper console shows up, so there + * won't be side-effects from postponing the removal. + */ static int __init printk_late_init(void) { struct console *con; for_each_console(con) { if (!keep_bootcon && con->flags & CON_BOOT) { - unregister_console(con); + /* + * Make sure to unregister boot consoles whose data + * resides in the init section before the init section + * is discarded. Boot consoles whose data will stick + * around will automatically be unregistered when the + * proper console replaces them. + */ + if (init_section_intersects(con, sizeof(*con))) + unregister_console(con); } } hotcpu_notifier(console_cpu_notify, 0); diff --git a/kernel/profile.c b/kernel/profile.c index 99513e1160e5..c2199e9901c9 100644 --- a/kernel/profile.c +++ b/kernel/profile.c @@ -44,7 +44,7 @@ int prof_on __read_mostly; EXPORT_SYMBOL_GPL(prof_on); static cpumask_var_t prof_cpu_mask; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits); static DEFINE_PER_CPU(int, cpu_profile_flip); static DEFINE_MUTEX(profile_flip_mutex); @@ -59,6 +59,7 @@ int profile_setup(char *str) if (!strncmp(str, sleepstr, strlen(sleepstr))) { #ifdef CONFIG_SCHEDSTATS + force_schedstat_enabled(); prof_on = SLEEP_PROFILING; if (str[strlen(sleepstr)] == ',') str += strlen(sleepstr) + 1; @@ -201,7 +202,7 @@ int profile_event_unregister(enum profile_type type, struct notifier_block *n) } EXPORT_SYMBOL_GPL(profile_event_unregister); -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS) /* * Each cpu has a pair of open-addressed hashtables for pending * profile hits. read_profile() IPI's all cpus to request them diff --git a/kernel/ptrace.c b/kernel/ptrace.c index b760bae64cf1..d49bfa1e53e6 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -73,12 +73,11 @@ void __ptrace_unlink(struct task_struct *child) { BUG_ON(!child->ptrace); - child->ptrace = 0; child->parent = child->real_parent; list_del_init(&child->ptrace_entry); spin_lock(&child->sighand->siglock); - + child->ptrace = 0; /* * Clear all pending traps and TRAPPING. TRAPPING should be * cleared regardless of JOBCTL_STOP_PENDING. Do it explicitly. @@ -219,6 +218,14 @@ static int ptrace_has_cap(struct user_namespace *ns, unsigned int mode) static int __ptrace_may_access(struct task_struct *task, unsigned int mode) { const struct cred *cred = current_cred(), *tcred; + int dumpable = 0; + kuid_t caller_uid; + kgid_t caller_gid; + + if (!(mode & PTRACE_MODE_FSCREDS) == !(mode & PTRACE_MODE_REALCREDS)) { + WARN(1, "denying ptrace access check without PTRACE_MODE_*CREDS\n"); + return -EPERM; + } /* May we inspect the given task? * This check is used both for attaching with ptrace @@ -228,18 +235,33 @@ static int __ptrace_may_access(struct task_struct *task, unsigned int mode) * because setting up the necessary parent/child relationship * or halting the specified task is impossible. */ - int dumpable = 0; + /* Don't let security modules deny introspection */ if (same_thread_group(task, current)) return 0; rcu_read_lock(); + if (mode & PTRACE_MODE_FSCREDS) { + caller_uid = cred->fsuid; + caller_gid = cred->fsgid; + } else { + /* + * Using the euid would make more sense here, but something + * in userland might rely on the old behavior, and this + * shouldn't be a security problem since + * PTRACE_MODE_REALCREDS implies that the caller explicitly + * used a syscall that requests access to another process + * (and not a filesystem syscall to procfs). + */ + caller_uid = cred->uid; + caller_gid = cred->gid; + } tcred = __task_cred(task); - if (uid_eq(cred->uid, tcred->euid) && - uid_eq(cred->uid, tcred->suid) && - uid_eq(cred->uid, tcred->uid) && - gid_eq(cred->gid, tcred->egid) && - gid_eq(cred->gid, tcred->sgid) && - gid_eq(cred->gid, tcred->gid)) + if (uid_eq(caller_uid, tcred->euid) && + uid_eq(caller_uid, tcred->suid) && + uid_eq(caller_uid, tcred->uid) && + gid_eq(caller_gid, tcred->egid) && + gid_eq(caller_gid, tcred->sgid) && + gid_eq(caller_gid, tcred->gid)) goto ok; if (ptrace_has_cap(tcred->user_ns, mode)) goto ok; @@ -306,7 +328,7 @@ static int ptrace_attach(struct task_struct *task, long request, goto out; task_lock(task); - retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH); + retval = __ptrace_may_access(task, PTRACE_MODE_ATTACH_REALCREDS); task_unlock(task); if (retval) goto unlock_creds; @@ -364,8 +386,14 @@ unlock_creds: mutex_unlock(&task->signal->cred_guard_mutex); out: if (!retval) { - wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, - TASK_UNINTERRUPTIBLE); + /* + * We do not bother to change retval or clear JOBCTL_TRAPPING + * if wait_on_bit() was interrupted by SIGKILL. The tracer will + * not return to user-mode, it will exit and clear this bit in + * __ptrace_unlink() if it wasn't already cleared by the tracee; + * and until then nobody can ptrace this task. + */ + wait_on_bit(&task->jobctl, JOBCTL_TRAPPING_BIT, TASK_KILLABLE); proc_ptrace_connector(task, PTRACE_ATTACH); } @@ -652,7 +680,7 @@ static int ptrace_peek_siginfo(struct task_struct *child, break; #ifdef CONFIG_COMPAT - if (unlikely(is_compat_task())) { + if (unlikely(in_compat_syscall())) { compat_siginfo_t __user *uinfo = compat_ptr(data); if (copy_siginfo_to_user32(uinfo, &info) || diff --git a/kernel/rcu/Makefile b/kernel/rcu/Makefile index 61a16569ffbf..032b2c015beb 100644 --- a/kernel/rcu/Makefile +++ b/kernel/rcu/Makefile @@ -1,3 +1,7 @@ +# Any varying coverage in these files is non-deterministic +# and is generally not a function of system call inputs. +KCOV_INSTRUMENT := n + obj-y += update.o sync.o obj-$(CONFIG_SRCU) += srcu.o obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d89328e260df..250ea67c1615 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -130,10 +130,8 @@ static struct rcu_torture __rcu *rcu_torture_current; static unsigned long rcu_torture_current_version; static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN]; static DEFINE_SPINLOCK(rcu_torture_lock); -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_count) = { 0 }; -static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], - rcu_torture_batch) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) = { 0 }; +static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) = { 0 }; static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1]; static atomic_t n_rcu_torture_alloc; static atomic_t n_rcu_torture_alloc_fail; @@ -162,6 +160,27 @@ static int rcu_torture_writer_state; #define RTWS_SYNC 7 #define RTWS_STUTTER 8 #define RTWS_STOPPING 9 +static const char * const rcu_torture_writer_state_names[] = { + "RTWS_FIXED_DELAY", + "RTWS_DELAY", + "RTWS_REPLACE", + "RTWS_DEF_FREE", + "RTWS_EXP_SYNC", + "RTWS_COND_GET", + "RTWS_COND_SYNC", + "RTWS_SYNC", + "RTWS_STUTTER", + "RTWS_STOPPING", +}; + +static const char *rcu_torture_writer_state_getname(void) +{ + unsigned int i = READ_ONCE(rcu_torture_writer_state); + + if (i >= ARRAY_SIZE(rcu_torture_writer_state_names)) + return "???"; + return rcu_torture_writer_state_names[i]; +} #if defined(MODULE) || defined(CONFIG_RCU_TORTURE_TEST_RUNNABLE) #define RCUTORTURE_RUNNABLE_INIT 1 @@ -911,12 +930,14 @@ rcu_torture_writer(void *arg) int nsynctypes = 0; VERBOSE_TOROUT_STRING("rcu_torture_writer task started"); - pr_alert("%s" TORTURE_FLAG - " Grace periods expedited from boot/sysfs for %s,\n", - torture_type, cur_ops->name); - pr_alert("%s" TORTURE_FLAG - " Testing of dynamic grace-period expediting diabled.\n", - torture_type); + if (!can_expedite) { + pr_alert("%s" TORTURE_FLAG + " Grace periods expedited from boot/sysfs for %s,\n", + torture_type, cur_ops->name); + pr_alert("%s" TORTURE_FLAG + " Disabled dynamic grace-period expediting.\n", + torture_type); + } /* Initialize synctype[] array. If none set, take default. */ if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1) @@ -1307,7 +1328,8 @@ rcu_torture_stats_print(void) rcutorture_get_gp_data(cur_ops->ttype, &flags, &gpnum, &completed); - pr_alert("??? Writer stall state %d g%lu c%lu f%#x\n", + pr_alert("??? Writer stall state %s(%d) g%lu c%lu f%#x\n", + rcu_torture_writer_state_getname(), rcu_torture_writer_state, gpnum, completed, flags); show_rcu_gp_kthreads(); diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c index a63a1ea5a41b..9b9cdd549caa 100644 --- a/kernel/rcu/srcu.c +++ b/kernel/rcu/srcu.c @@ -489,7 +489,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount) */ void synchronize_srcu(struct srcu_struct *sp) { - __synchronize_srcu(sp, rcu_gp_is_expedited() + __synchronize_srcu(sp, (rcu_gp_is_expedited() && !rcu_gp_is_normal()) ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT : SYNCHRONIZE_SRCU_TRYCOUNT); } diff --git a/kernel/rcu/tiny_plugin.h b/kernel/rcu/tiny_plugin.h index e492a5253e0f..196f0302e2f4 100644 --- a/kernel/rcu/tiny_plugin.h +++ b/kernel/rcu/tiny_plugin.h @@ -23,7 +23,7 @@ */ #include <linux/kthread.h> -#include <linux/module.h> +#include <linux/init.h> #include <linux/debugfs.h> #include <linux/seq_file.h> @@ -122,18 +122,7 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutiny_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - -module_init(rcutiny_trace_init); -module_exit(rcutiny_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for tiny implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutiny_trace_init); static void check_cpu_stall(struct rcu_ctrlblk *rcp) { diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..9a535a86e732 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -68,10 +68,6 @@ MODULE_ALIAS("rcutree"); /* Data structures. */ -static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; -static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; - /* * In order to export the rcu_state name to the tracing tools, it * needs to be added in the __tracepoint_string section. @@ -112,7 +108,6 @@ RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched); RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh); static struct rcu_state *const rcu_state_p; -static struct rcu_data __percpu *const rcu_data_p; LIST_HEAD(rcu_struct_flavors); /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -246,24 +241,17 @@ static int rcu_gp_in_progress(struct rcu_state *rsp) */ void rcu_sched_qs(void) { - unsigned long flags; - - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) { - trace_rcu_grace_period(TPS("rcu_sched"), - __this_cpu_read(rcu_sched_data.gpnum), - TPS("cpuqs")); - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); - if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) - return; - local_irq_save(flags); - if (__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) { - __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(&rcu_sched_data), - true); - } - local_irq_restore(flags); - } + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.s)) + return; + trace_rcu_grace_period(TPS("rcu_sched"), + __this_cpu_read(rcu_sched_data.gpnum), + TPS("cpuqs")); + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.norm, false); + if (!__this_cpu_read(rcu_sched_data.cpu_no_qs.b.exp)) + return; + __this_cpu_write(rcu_sched_data.cpu_no_qs.b.exp, false); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(&rcu_sched_data), true); } void rcu_bh_qs(void) @@ -300,17 +288,16 @@ EXPORT_PER_CPU_SYMBOL_GPL(rcu_qs_ctr); * We inform the RCU core by emulating a zero-duration dyntick-idle * period, which we in turn do by incrementing the ->dynticks counter * by two. + * + * The caller must have disabled interrupts. */ static void rcu_momentary_dyntick_idle(void) { - unsigned long flags; struct rcu_data *rdp; struct rcu_dynticks *rdtp; int resched_mask; struct rcu_state *rsp; - local_irq_save(flags); - /* * Yes, we can lose flag-setting operations. This is OK, because * the flag will be set again after some delay. @@ -340,13 +327,12 @@ static void rcu_momentary_dyntick_idle(void) smp_mb__after_atomic(); /* Later stuff after QS. */ break; } - local_irq_restore(flags); } /* * Note a context switch. This is a quiescent state for RCU-sched, * and requires special handling for preemptible RCU. - * The caller must have disabled preemption. + * The caller must have disabled interrupts. */ void rcu_note_context_switch(void) { @@ -376,9 +362,14 @@ EXPORT_SYMBOL_GPL(rcu_note_context_switch); */ void rcu_all_qs(void) { + unsigned long flags; + barrier(); /* Avoid RCU read-side critical sections leaking down. */ - if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) + if (unlikely(raw_cpu_read(rcu_sched_qs_mask))) { + local_irq_save(flags); rcu_momentary_dyntick_idle(); + local_irq_restore(flags); + } this_cpu_inc(rcu_qs_ctr); barrier(); /* Avoid RCU read-side critical sections leaking up. */ } @@ -605,25 +596,25 @@ static int rcu_future_needs_gp(struct rcu_state *rsp) * The caller must have disabled interrupts to prevent races with * normal callback registry. */ -static int +static bool cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp) { int i; if (rcu_gp_in_progress(rsp)) - return 0; /* No, a grace period is already in progress. */ + return false; /* No, a grace period is already in progress. */ if (rcu_future_needs_gp(rsp)) - return 1; /* Yes, a no-CBs CPU needs one. */ + return true; /* Yes, a no-CBs CPU needs one. */ if (!rdp->nxttail[RCU_NEXT_TAIL]) - return 0; /* No, this is a no-CBs (or offline) CPU. */ + return false; /* No, this is a no-CBs (or offline) CPU. */ if (*rdp->nxttail[RCU_NEXT_READY_TAIL]) - return 1; /* Yes, this CPU has newly registered callbacks. */ + return true; /* Yes, CPU has newly registered callbacks. */ for (i = RCU_WAIT_TAIL; i < RCU_NEXT_TAIL; i++) if (rdp->nxttail[i - 1] != rdp->nxttail[i] && ULONG_CMP_LT(READ_ONCE(rsp->completed), rdp->nxtcompleted[i])) - return 1; /* Yes, CBs for future grace period. */ - return 0; /* No grace period needed. */ + return true; /* Yes, CBs for future grace period. */ + return false; /* No grace period needed. */ } /* @@ -740,7 +731,7 @@ void rcu_user_enter(void) * * Exit from an interrupt handler, which might possibly result in entering * idle mode, in other words, leaving the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * This code assumes that the idle loop never does anything that might * result in unbalanced calls to irq_enter() and irq_exit(). If your @@ -753,11 +744,10 @@ void rcu_user_enter(void) */ void rcu_irq_exit(void) { - unsigned long flags; long long oldval; struct rcu_dynticks *rdtp; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; @@ -768,6 +758,17 @@ void rcu_irq_exit(void) else rcu_eqs_enter_common(oldval, true); rcu_sysidle_enter(1); +} + +/* + * Wrapper for rcu_irq_exit() where interrupts are enabled. + */ +void rcu_irq_exit_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_exit(); local_irq_restore(flags); } @@ -865,7 +866,7 @@ void rcu_user_exit(void) * * Enter an interrupt handler, which might possibly result in exiting * idle mode, in other words, entering the mode in which read-side critical - * sections can occur. + * sections can occur. The caller must have disabled interrupts. * * Note that the Linux kernel is fully capable of entering an interrupt * handler that it never exits, for example when doing upcalls to @@ -881,11 +882,10 @@ void rcu_user_exit(void) */ void rcu_irq_enter(void) { - unsigned long flags; struct rcu_dynticks *rdtp; long long oldval; - local_irq_save(flags); + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); rdtp = this_cpu_ptr(&rcu_dynticks); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; @@ -896,6 +896,17 @@ void rcu_irq_enter(void) else rcu_eqs_exit_common(oldval, true); rcu_sysidle_exit(1); +} + +/* + * Wrapper for rcu_irq_enter() where interrupts are enabled. + */ +void rcu_irq_enter_irqson(void) +{ + unsigned long flags; + + local_irq_save(flags); + rcu_irq_enter(); local_irq_restore(flags); } @@ -1071,13 +1082,12 @@ static int dyntick_save_progress_counter(struct rcu_data *rdp, rcu_sysidle_check_cpu(rdp, isidle, maxj); if ((rdp->dynticks_snap & 0x1) == 0) { trace_rcu_fqs(rdp->rsp->name, rdp->gpnum, rdp->cpu, TPS("dti")); - return 1; - } else { if (ULONG_CMP_LT(READ_ONCE(rdp->gpnum) + ULONG_MAX / 4, rdp->mynode->gpnum)) WRITE_ONCE(rdp->gpwrap, true); - return 0; + return 1; } + return 0; } /* @@ -1161,15 +1171,16 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp, smp_mb(); /* ->cond_resched_completed before *rcrmp. */ WRITE_ONCE(*rcrmp, READ_ONCE(*rcrmp) + rdp->rsp->flavor_mask); - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Enable beating. */ - } else if (ULONG_CMP_GE(jiffies, rdp->rsp->jiffies_resched)) { - /* Time to beat on that CPU again! */ - resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ - rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + rdp->rsp->jiffies_resched += 5; /* Re-enable beating. */ } + /* And if it has been a really long time, kick the CPU as well. */ + if (ULONG_CMP_GE(jiffies, + rdp->rsp->gp_start + 2 * jiffies_till_sched_qs) || + ULONG_CMP_GE(jiffies, rdp->rsp->gp_start + jiffies_till_sched_qs)) + resched_cpu(rdp->cpu); /* Force CPU into scheduler. */ + return 0; } @@ -1187,6 +1198,16 @@ static void record_gp_stall_check_time(struct rcu_state *rsp) } /* + * Convert a ->gp_state value to a character string. + */ +static const char *gp_state_getname(short gs) +{ + if (gs < 0 || gs >= ARRAY_SIZE(gp_state_names)) + return "???"; + return gp_state_names[gs]; +} + +/* * Complain about starvation of grace-period kthread. */ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) @@ -1196,12 +1217,16 @@ static void rcu_check_gp_kthread_starvation(struct rcu_state *rsp) j = jiffies; gpa = READ_ONCE(rsp->gp_activity); - if (j - gpa > 2 * HZ) - pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x s%d ->state=%#lx\n", + if (j - gpa > 2 * HZ) { + pr_err("%s kthread starved for %ld jiffies! g%lu c%lu f%#x %s(%d) ->state=%#lx\n", rsp->name, j - gpa, rsp->gpnum, rsp->completed, - rsp->gp_flags, rsp->gp_state, - rsp->gp_kthread ? rsp->gp_kthread->state : 0); + rsp->gp_flags, + gp_state_getname(rsp->gp_state), rsp->gp_state, + rsp->gp_kthread ? rsp->gp_kthread->state : ~0); + if (rsp->gp_kthread) + sched_show_task(rsp->gp_kthread); + } } /* @@ -1214,13 +1239,13 @@ static void rcu_dump_cpu_stacks(struct rcu_state *rsp) struct rcu_node *rnp; rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) if (rnp->qsmask & (1UL << cpu)) dump_cpu_task(rnp->grplo + cpu); } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1237,15 +1262,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) /* Only let one CPU complain about others per time interval. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); delta = jiffies - READ_ONCE(rsp->jiffies_stall); if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * OK, time to rat on our buddy... @@ -1256,7 +1281,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) rsp->name); print_cpu_stall_info_begin(); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); ndetected += rcu_print_task_stall(rnp); if (rnp->qsmask != 0) { for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -1266,7 +1291,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum) ndetected++; } } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } print_cpu_stall_info_end(); @@ -1327,11 +1352,11 @@ static void print_cpu_stall(struct rcu_state *rsp) rcu_dump_cpu_stacks(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (ULONG_CMP_GE(jiffies, READ_ONCE(rsp->jiffies_stall))) WRITE_ONCE(rsp->jiffies_stall, jiffies + 3 * rcu_jiffies_till_stall_check() + 3); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* * Attempt to revive the RCU machinery by forcing a context switch. @@ -1534,10 +1559,8 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, * hold it, acquire the root rcu_node structure's lock in order to * start one (if needed). */ - if (rnp != rnp_root) { - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); - } + if (rnp != rnp_root) + raw_spin_lock_rcu_node(rnp_root); /* * Get a new grace-period number. If there really is no grace @@ -1571,7 +1594,7 @@ rcu_start_future_gp(struct rcu_node *rnp, struct rcu_data *rdp, } unlock_out: if (rnp != rnp_root) - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); out: if (c_out != NULL) *c_out = c; @@ -1590,7 +1613,6 @@ static int rcu_future_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) int needmore; struct rcu_data *rdp = this_cpu_ptr(rsp->rda); - rcu_nocb_gp_cleanup(rsp, rnp); rnp->need_future_gp[c & 0x1] = 0; needmore = rnp->need_future_gp[(c + 1) & 0x1]; trace_rcu_future_gp(rnp, rdp, c, @@ -1611,7 +1633,7 @@ static void rcu_gp_kthread_wake(struct rcu_state *rsp) !READ_ONCE(rsp->gp_flags) || !rsp->gp_kthread) return; - wake_up(&rsp->gp_wq); + swake_up(&rsp->gp_wq); } /* @@ -1786,13 +1808,12 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp) if ((rdp->gpnum == READ_ONCE(rnp->gpnum) && rdp->completed == READ_ONCE(rnp->completed) && !unlikely(READ_ONCE(rdp->gpwrap))) || /* w/out lock. */ - !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ + !raw_spin_trylock_rcu_node(rnp)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } - smp_mb__after_unlock_lock(); needwake = __note_gp_changes(rsp, rnp, rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1805,21 +1826,20 @@ static void rcu_gp_slow(struct rcu_state *rsp, int delay) } /* - * Initialize a new grace period. Return 0 if no grace period required. + * Initialize a new grace period. Return false if no grace period required. */ -static int rcu_gp_init(struct rcu_state *rsp) +static bool rcu_gp_init(struct rcu_state *rsp) { unsigned long oldmask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (!READ_ONCE(rsp->gp_flags)) { /* Spurious wakeup, tell caller to go back to sleep. */ - raw_spin_unlock_irq(&rnp->lock); - return 0; + raw_spin_unlock_irq_rcu_node(rnp); + return false; } WRITE_ONCE(rsp->gp_flags, 0); /* Clear all flags: New grace period. */ @@ -1828,8 +1848,8 @@ static int rcu_gp_init(struct rcu_state *rsp) * Grace period already in progress, don't start another. * Not supposed to be able to happen. */ - raw_spin_unlock_irq(&rnp->lock); - return 0; + raw_spin_unlock_irq_rcu_node(rnp); + return false; } /* Advance to a new grace period and initialize state. */ @@ -1837,7 +1857,7 @@ static int rcu_gp_init(struct rcu_state *rsp) /* Record GP times before starting GP, hence smp_store_release(). */ smp_store_release(&rsp->gpnum, rsp->gpnum + 1); trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start")); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Apply per-leaf buffered online and offline operations to the @@ -1847,12 +1867,11 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_leaf_node(rsp, rnp) { rcu_gp_slow(rsp, gp_preinit_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); if (rnp->qsmaskinit == rnp->qsmaskinitnext && !rnp->wait_blkd_tasks) { /* Nothing to do on this leaf rcu_node structure. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); continue; } @@ -1886,7 +1905,7 @@ static int rcu_gp_init(struct rcu_state *rsp) rcu_cleanup_dead_rnp(rnp); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -1904,8 +1923,7 @@ static int rcu_gp_init(struct rcu_state *rsp) */ rcu_for_each_node_breadth_first(rsp, rnp) { rcu_gp_slow(rsp, gp_init_delay); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); rdp = this_cpu_ptr(rsp->rda); rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; @@ -1918,12 +1936,12 @@ static int rcu_gp_init(struct rcu_state *rsp) trace_rcu_grace_period_init(rsp->name, rnp->gpnum, rnp->level, rnp->grplo, rnp->grphi, rnp->qsmask); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); } - return 1; + return true; } /* @@ -1973,11 +1991,10 @@ static void rcu_gp_fqs(struct rcu_state *rsp, bool first_time) } /* Clear flag to prevent immediate re-entry. */ if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) & ~RCU_GP_FLAG_FQS); - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } } @@ -1991,10 +2008,10 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) int nocb = 0; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(rsp); + struct swait_queue_head *sq; WRITE_ONCE(rsp->gp_activity, jiffies); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); gp_duration = jiffies - rsp->gp_start; if (gp_duration > rsp->gp_max) rsp->gp_max = gp_duration; @@ -2007,7 +2024,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * safe for us to drop the lock in order to mark the grace * period as completed in all of the rcu_node structures. */ - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); /* * Propagate new ->completed value to rcu_node structures so @@ -2019,8 +2036,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) * grace period is recorded in any of the rcu_node structures. */ rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_irq_rcu_node(rnp); WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp)); WARN_ON_ONCE(rnp->qsmask); WRITE_ONCE(rnp->completed, rsp->gpnum); @@ -2029,14 +2045,15 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) needgp = __note_gp_changes(rsp, rnp, rdp) || needgp; /* smp_mb() provided by prior unlock-lock pair. */ nocb += rcu_future_gp_cleanup(rsp, rnp); - raw_spin_unlock_irq(&rnp->lock); + sq = rcu_nocb_gp_get(rnp); + raw_spin_unlock_irq_rcu_node(rnp); + rcu_nocb_gp_cleanup(sq); cond_resched_rcu_qs(); WRITE_ONCE(rsp->gp_activity, jiffies); rcu_gp_slow(rsp, gp_cleanup_delay); } rnp = rcu_get_root(rsp); - raw_spin_lock_irq(&rnp->lock); - smp_mb__after_unlock_lock(); /* Order GP before ->completed update. */ + raw_spin_lock_irq_rcu_node(rnp); /* Order GP before ->completed update. */ rcu_nocb_gp_set(rnp, nocb); /* Declare grace period done. */ @@ -2052,7 +2069,7 @@ static void rcu_gp_cleanup(struct rcu_state *rsp) READ_ONCE(rsp->gpnum), TPS("newreq")); } - raw_spin_unlock_irq(&rnp->lock); + raw_spin_unlock_irq_rcu_node(rnp); } /* @@ -2076,7 +2093,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("reqwait")); rsp->gp_state = RCU_GP_WAIT_GPS; - wait_event_interruptible(rsp->gp_wq, + swait_event_interruptible(rsp->gp_wq, READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_INIT); rsp->gp_state = RCU_GP_DONE_GPS; @@ -2106,7 +2123,7 @@ static int __noreturn rcu_gp_kthread(void *arg) READ_ONCE(rsp->gpnum), TPS("fqswait")); rsp->gp_state = RCU_GP_WAIT_FQS; - ret = wait_event_interruptible_timeout(rsp->gp_wq, + ret = swait_event_interruptible_timeout(rsp->gp_wq, rcu_gp_fqs_check_wake(rsp, &gf), j); rsp->gp_state = RCU_GP_DOING_FQS; /* Locking provides needed memory barriers. */ @@ -2218,19 +2235,21 @@ static bool rcu_start_gp(struct rcu_state *rsp) } /* - * Report a full set of quiescent states to the specified rcu_state - * data structure. This involves cleaning up after the prior grace - * period and letting rcu_start_gp() start up the next grace period - * if one is needed. Note that the caller must hold rnp->lock, which - * is released before return. + * Report a full set of quiescent states to the specified rcu_state data + * structure. Invoke rcu_gp_kthread_wake() to awaken the grace-period + * kthread if another grace period is required. Whether we wake + * the grace-period kthread or it awakens itself for the next round + * of quiescent-state forcing, that kthread will clean up after the + * just-completed grace period. Note that the caller must hold rnp->lock, + * which is released before return. */ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags) __releases(rcu_get_root(rsp)->lock) { WARN_ON_ONCE(!rcu_gp_in_progress(rsp)); WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); - rcu_gp_kthread_wake(rsp); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2259,7 +2278,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, * Our bit has already been cleared, or the * relevant grace period is already over, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */ @@ -2271,7 +2290,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { /* Other bits still set at this level, so done. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rnp->grpmask; @@ -2281,11 +2300,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); rnp_c = rnp; rnp = rnp->parent; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); oldmask = rnp_c->qsmask; } @@ -2314,7 +2332,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; /* Still need more quiescent states! */ } @@ -2331,20 +2349,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp, /* Report up the rest of the hierarchy, tracking current ->gpnum. */ gps = rnp->gpnum; mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ + raw_spin_lock_rcu_node(rnp_p); /* irqs already disabled. */ rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags); } /* * Record a quiescent state for the specified CPU to that CPU's rcu_data - * structure. This must be either called from the specified CPU, or - * called when the specified CPU is known to be offline (and when it is - * also known that no other CPU is concurrently trying to help the offline - * CPU). The lastcomp argument is used to make sure we are still in the - * grace period of interest. We don't want to end the current grace period - * based on quiescent states detected in an earlier grace period! + * structure. This must be called from the specified CPU. */ static void rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) @@ -2355,8 +2367,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) struct rcu_node *rnp; rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if ((rdp->cpu_no_qs.b.norm && rdp->rcu_qs_ctr_snap == __this_cpu_read(rcu_qs_ctr)) || rdp->gpnum != rnp->gpnum || rnp->completed == rnp->gpnum || @@ -2370,14 +2381,14 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp) */ rdp->cpu_no_qs.b.norm = true; /* need qs for new gp. */ rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } else { - rdp->core_needs_qs = 0; + rdp->core_needs_qs = false; /* * This GP can't end until cpu checks in, so all of our @@ -2582,42 +2593,19 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (!rnp) break; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); /* GP memory ordering. */ + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; rnp->qsmask &= ~mask; if (rnp->qsmaskinit) { - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); + /* irqs remain disabled. */ return; } - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ } } /* - * The CPU is exiting the idle loop into the arch_cpu_idle_dead() - * function. We now remove it from the rcu_node tree's ->qsmaskinit - * bit masks. - */ -static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) -{ - unsigned long flags; - unsigned long mask; - struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); - struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ - - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return; - - /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ - mask = rdp->grpmask; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); /* Enforce GP memory-order guarantee. */ - rnp->qsmaskinitnext &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); -} - -/* * The CPU has been completely removed, and some other CPU is reporting * this fact from process context. Do the remainder of the cleanup, * including orphaning the outgoing CPU's RCU callbacks, and also @@ -2809,8 +2797,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_for_each_leaf_node(rsp, rnp) { cond_resched_rcu_qs(); mask = 0; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->qsmask == 0) { if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p || @@ -2849,7 +2836,7 @@ static void force_qs_rnp(struct rcu_state *rsp, rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags); } else { /* Nothing to do here, so just drop the lock. */ - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } } @@ -2881,17 +2868,16 @@ static void force_quiescent_state(struct rcu_state *rsp) /* rnp_old == rcu_get_root(rsp), rnp == NULL. */ /* Reached the root of the rcu_node tree, acquire lock. */ - raw_spin_lock_irqsave(&rnp_old->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_old, flags); raw_spin_unlock(&rnp_old->fqslock); if (READ_ONCE(rsp->gp_flags) & RCU_GP_FLAG_FQS) { rsp->n_force_qs_lh++; - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); return; /* Someone beat us to it. */ } WRITE_ONCE(rsp->gp_flags, READ_ONCE(rsp->gp_flags) | RCU_GP_FLAG_FQS); - raw_spin_unlock_irqrestore(&rnp_old->lock, flags); - rcu_gp_kthread_wake(rsp); + raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags); + swake_up(&rsp->gp_wq); /* Memory barrier implied by swake_up() path. */ } /* @@ -2914,9 +2900,9 @@ __rcu_process_callbacks(struct rcu_state *rsp) /* Does this CPU require a not-yet-started grace period? */ local_irq_save(flags); if (cpu_needs_another_gp(rsp, rdp)) { - raw_spin_lock(&rcu_get_root(rsp)->lock); /* irqs disabled. */ + raw_spin_lock_rcu_node(rcu_get_root(rsp)); /* irqs disabled. */ needwake = rcu_start_gp(rsp); - raw_spin_unlock_irqrestore(&rcu_get_root(rsp)->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(rsp), flags); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3005,10 +2991,9 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp, if (!rcu_gp_in_progress(rsp)) { struct rcu_node *rnp_root = rcu_get_root(rsp); - raw_spin_lock(&rnp_root->lock); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp_root); needwake = rcu_start_gp(rsp); - raw_spin_unlock(&rnp_root->lock); + raw_spin_unlock_rcu_node(rnp_root); if (needwake) rcu_gp_kthread_wake(rsp); } else { @@ -3365,7 +3350,6 @@ static unsigned long rcu_seq_snap(unsigned long *sp) { unsigned long s; - smp_mb(); /* Caller's modifications seen first by other CPUs. */ s = (READ_ONCE(*sp) + 3) & ~0x1; smp_mb(); /* Above access must not bleed into critical section. */ return s; @@ -3392,6 +3376,7 @@ static void rcu_exp_gp_seq_end(struct rcu_state *rsp) } static unsigned long rcu_exp_gp_seq_snap(struct rcu_state *rsp) { + smp_mb(); /* Caller's modifications seen first by other CPUs. */ return rcu_seq_snap(&rsp->expedited_sequence); } static bool rcu_exp_gp_seq_done(struct rcu_state *rsp, unsigned long s) @@ -3426,17 +3411,16 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) * CPUs for the current rcu_node structure up the rcu_node tree. */ rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (rnp->expmaskinit == rnp->expmaskinitnext) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); continue; /* No new CPUs, nothing to do. */ } /* Update this node's mask, track old value for propagation. */ oldmask = rnp->expmaskinit; rnp->expmaskinit = rnp->expmaskinitnext; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* If was already nonzero, nothing to propagate. */ if (oldmask) @@ -3447,12 +3431,11 @@ static void sync_exp_reset_tree_hotplug(struct rcu_state *rsp) rnp_up = rnp->parent; done = false; while (rnp_up) { - raw_spin_lock_irqsave(&rnp_up->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp_up, flags); if (rnp_up->expmaskinit) done = true; rnp_up->expmaskinit |= mask; - raw_spin_unlock_irqrestore(&rnp_up->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp_up, flags); if (done) break; mask = rnp_up->grpmask; @@ -3472,11 +3455,10 @@ static void __maybe_unused sync_exp_reset_tree(struct rcu_state *rsp) sync_exp_reset_tree_hotplug(rsp); rcu_for_each_node_breadth_first(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); WARN_ON_ONCE(rnp->expmask); rnp->expmask = rnp->expmaskinit; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -3517,22 +3499,21 @@ static void __rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp, if (!rnp->expmask) rcu_initiate_boost(rnp, flags); else - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); break; } if (rnp->parent == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (wake) { smp_mb(); /* EGP done before wake_up(). */ - wake_up(&rsp->expedited_wq); + swake_up(&rsp->expedited_wq); } break; } mask = rnp->grpmask; - raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled */ rnp = rnp->parent; - raw_spin_lock(&rnp->lock); /* irqs already disabled */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled */ WARN_ON_ONCE(!(rnp->expmask & mask)); rnp->expmask &= ~mask; } @@ -3549,8 +3530,7 @@ static void __maybe_unused rcu_report_exp_rnp(struct rcu_state *rsp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); __rcu_report_exp_rnp(rsp, rnp, wake, flags); } @@ -3564,10 +3544,9 @@ static void rcu_report_exp_cpu_mult(struct rcu_state *rsp, struct rcu_node *rnp, { unsigned long flags; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!(rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } rnp->expmask &= ~mask; @@ -3609,7 +3588,7 @@ static bool sync_exp_work_done(struct rcu_state *rsp, struct rcu_node *rnp, */ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) { - struct rcu_data *rdp; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); struct rcu_node *rnp0; struct rcu_node *rnp1 = NULL; @@ -3623,7 +3602,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) if (!mutex_is_locked(&rnp0->exp_funnel_mutex)) { if (mutex_trylock(&rnp0->exp_funnel_mutex)) { if (sync_exp_work_done(rsp, rnp0, NULL, - &rsp->expedited_workdone0, s)) + &rdp->expedited_workdone0, s)) return NULL; return rnp0; } @@ -3637,14 +3616,13 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) * can be inexact, as it is just promoting locality and is not * strictly needed for correctness. */ - rdp = per_cpu_ptr(rsp->rda, raw_smp_processor_id()); - if (sync_exp_work_done(rsp, NULL, NULL, &rsp->expedited_workdone1, s)) + if (sync_exp_work_done(rsp, NULL, NULL, &rdp->expedited_workdone1, s)) return NULL; mutex_lock(&rdp->exp_funnel_mutex); rnp0 = rdp->mynode; for (; rnp0 != NULL; rnp0 = rnp0->parent) { if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone2, s)) + &rdp->expedited_workdone2, s)) return NULL; mutex_lock(&rnp0->exp_funnel_mutex); if (rnp1) @@ -3654,7 +3632,7 @@ static struct rcu_node *exp_funnel_lock(struct rcu_state *rsp, unsigned long s) rnp1 = rnp0; } if (sync_exp_work_done(rsp, rnp1, rdp, - &rsp->expedited_workdone3, s)) + &rdp->expedited_workdone3, s)) return NULL; return rnp1; } @@ -3708,8 +3686,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, sync_exp_reset_tree(rsp); rcu_for_each_leaf_node(rsp, rnp) { - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Each pass checks a CPU for identity, offline, and idle. */ mask_ofl_test = 0; @@ -3730,7 +3707,7 @@ static void sync_rcu_exp_select_cpus(struct rcu_state *rsp, */ if (rcu_preempt_has_tasks(rnp)) rnp->exp_tasks = rnp->blkd_tasks.next; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* IPI the remaining CPUs for expedited quiescent state. */ mask = 1; @@ -3741,24 +3718,22 @@ retry_ipi: ret = smp_call_function_single(cpu, func, rsp, 0); if (!ret) { mask_ofl_ipi &= ~mask; - } else { - /* Failed, raced with offline. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + continue; + } + /* Failed, raced with offline. */ + raw_spin_lock_irqsave_rcu_node(rnp, flags); + if (cpu_online(cpu) && + (rnp->expmask & mask)) { + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + schedule_timeout_uninterruptible(1); if (cpu_online(cpu) && - (rnp->expmask & mask)) { - raw_spin_unlock_irqrestore(&rnp->lock, - flags); - schedule_timeout_uninterruptible(1); - if (cpu_online(cpu) && - (rnp->expmask & mask)) - goto retry_ipi; - raw_spin_lock_irqsave(&rnp->lock, - flags); - } - if (!(rnp->expmask & mask)) - mask_ofl_ipi &= ~mask; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + (rnp->expmask & mask)) + goto retry_ipi; + raw_spin_lock_irqsave_rcu_node(rnp, flags); } + if (!(rnp->expmask & mask)) + mask_ofl_ipi &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Report quiescent states for those that went offline. */ mask_ofl_test |= mask_ofl_ipi; @@ -3773,6 +3748,7 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) unsigned long jiffies_stall; unsigned long jiffies_start; unsigned long mask; + int ndetected; struct rcu_node *rnp; struct rcu_node *rnp_root = rcu_get_root(rsp); int ret; @@ -3781,28 +3757,30 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) jiffies_start = jiffies; for (;;) { - ret = wait_event_interruptible_timeout( + ret = swait_event_timeout( rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root), jiffies_stall); - if (ret > 0) + if (ret > 0 || sync_rcu_preempt_exp_done(rnp_root)) return; if (ret < 0) { /* Hit a signal, disable CPU stall warnings. */ - wait_event(rsp->expedited_wq, + swait_event(rsp->expedited_wq, sync_rcu_preempt_exp_done(rnp_root)); return; } pr_err("INFO: %s detected expedited stalls on CPUs/tasks: {", rsp->name); + ndetected = 0; rcu_for_each_leaf_node(rsp, rnp) { - (void)rcu_print_task_exp_stall(rnp); + ndetected = rcu_print_task_exp_stall(rnp); mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { struct rcu_data *rdp; if (!(rnp->expmask & mask)) continue; + ndetected++; rdp = per_cpu_ptr(rsp->rda, cpu); pr_cont(" %d-%c%c%c", cpu, "O."[cpu_online(cpu)], @@ -3811,8 +3789,23 @@ static void synchronize_sched_expedited_wait(struct rcu_state *rsp) } mask <<= 1; } - pr_cont(" } %lu jiffies s: %lu\n", - jiffies - jiffies_start, rsp->expedited_sequence); + pr_cont(" } %lu jiffies s: %lu root: %#lx/%c\n", + jiffies - jiffies_start, rsp->expedited_sequence, + rnp_root->expmask, ".T"[!!rnp_root->exp_tasks]); + if (!ndetected) { + pr_err("blocking rcu_node structures:"); + rcu_for_each_node_breadth_first(rsp, rnp) { + if (rnp == rnp_root) + continue; /* printed unconditionally */ + if (sync_rcu_preempt_exp_done(rnp)) + continue; + pr_cont(" l=%u:%d-%d:%#lx/%c", + rnp->level, rnp->grplo, rnp->grphi, + rnp->expmask, + ".T"[!!rnp->exp_tasks]); + } + pr_cont("\n"); + } rcu_for_each_leaf_node(rsp, rnp) { mask = 1; for (cpu = rnp->grplo; cpu <= rnp->grphi; cpu++, mask <<= 1) { @@ -3847,6 +3840,16 @@ void synchronize_sched_expedited(void) struct rcu_node *rnp; struct rcu_state *rsp = &rcu_sched_state; + /* If only one CPU, this is automatically a grace period. */ + if (rcu_blocking_is_gp()) + return; + + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu_sched); + return; + } + /* Take a snapshot of the sequence number. */ s = rcu_exp_gp_seq_snap(rsp); @@ -4135,9 +4138,9 @@ static void rcu_init_new_rnp(struct rcu_node *rnp_leaf) rnp = rnp->parent; if (rnp == NULL) return; - raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */ + raw_spin_lock_rcu_node(rnp); /* Interrupts already disabled. */ rnp->qsmaskinit |= mask; - raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* Interrupts remain disabled. */ } } @@ -4152,7 +4155,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->dynticks = &per_cpu(rcu_dynticks, cpu); WARN_ON_ONCE(rdp->dynticks->dynticks_nesting != DYNTICK_TASK_EXIT_IDLE); @@ -4161,7 +4164,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rsp = rsp; mutex_init(&rdp->exp_funnel_mutex); rcu_boot_init_nocb_percpu_data(rdp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -4179,7 +4182,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; @@ -4189,7 +4192,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rcu_sysidle_init_percpu_data(rdp->dynticks); atomic_set(&rdp->dynticks->dynticks, (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ /* * Add CPU to leaf rcu_node pending-online bitmask. Any needed @@ -4198,8 +4201,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) */ rnp = rdp->mynode; mask = rdp->grpmask; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ rnp->qsmaskinitnext |= mask; rnp->expmaskinitnext |= mask; if (!rdp->beenonline) @@ -4211,7 +4213,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->rcu_qs_ctr_snap = per_cpu(rcu_qs_ctr, cpu); rdp->core_needs_qs = false; trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl")); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void rcu_prepare_cpu(int cpu) @@ -4222,6 +4224,46 @@ static void rcu_prepare_cpu(int cpu) rcu_init_percpu_data(cpu, rsp); } +#ifdef CONFIG_HOTPLUG_CPU +/* + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + * The CPU is exiting the idle loop into the arch_cpu_idle_dead() + * function. We now remove it from the rcu_node tree's ->qsmaskinit + * bit masks. + */ +static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp) +{ + unsigned long flags; + unsigned long mask; + struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu); + struct rcu_node *rnp = rdp->mynode; /* Outgoing CPU's rdp & rnp. */ + + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return; + + /* Remove outgoing CPU from mask in the leaf rcu_node structure. */ + mask = rdp->grpmask; + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */ + rnp->qsmaskinitnext &= ~mask; + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); +} + +void rcu_report_dead(unsigned int cpu) +{ + struct rcu_state *rsp; + + /* QS for any half-done expedited RCU-sched GP. */ + preempt_disable(); + rcu_report_exp_rdp(&rcu_sched_state, + this_cpu_ptr(rcu_sched_state.rda), true); + preempt_enable(); + for_each_rcu_flavor(rsp) + rcu_cleanup_dying_idle_cpu(cpu, rsp); +} +#endif + /* * Handle CPU online/offline notification events. */ @@ -4253,17 +4295,6 @@ int rcu_cpu_notify(struct notifier_block *self, for_each_rcu_flavor(rsp) rcu_cleanup_dying_cpu(rsp); break; - case CPU_DYING_IDLE: - /* QS for any half-done expedited RCU-sched GP. */ - preempt_disable(); - rcu_report_exp_rdp(&rcu_sched_state, - this_cpu_ptr(rcu_sched_state.rda), true); - preempt_enable(); - - for_each_rcu_flavor(rsp) { - rcu_cleanup_dying_idle_cpu(cpu, rsp); - } - break; case CPU_DEAD: case CPU_DEAD_FROZEN: case CPU_UP_CANCELED: @@ -4327,14 +4358,14 @@ static int __init rcu_spawn_gp_kthread(void) t = kthread_create(rcu_gp_kthread, rsp, "%s", rsp->name); BUG_ON(IS_ERR(t)); rnp = rcu_get_root(rsp); - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rsp->gp_kthread = t; if (kthread_prio) { sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); } + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); wake_up_process(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); } rcu_spawn_nocb_kthreads(); rcu_spawn_boost_kthreads(); @@ -4385,12 +4416,14 @@ static void __init rcu_init_levelspread(int *levelspread, const int *levelcnt) /* * Helper function for rcu_init() that initializes one rcu_state structure. */ -static void __init rcu_init_one(struct rcu_state *rsp, - struct rcu_data __percpu *rda) +static void __init rcu_init_one(struct rcu_state *rsp) { static const char * const buf[] = RCU_NODE_NAME_INIT; static const char * const fqs[] = RCU_FQS_NAME_INIT; static const char * const exp[] = RCU_EXP_NAME_INIT; + static struct lock_class_key rcu_node_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_fqs_class[RCU_NUM_LVLS]; + static struct lock_class_key rcu_exp_class[RCU_NUM_LVLS]; static u8 fl_mask = 0x1; int levelcnt[RCU_NUM_LVLS]; /* # nodes in each level. */ @@ -4422,8 +4455,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, cpustride *= levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < levelcnt[i]; j++, rnp++) { - raw_spin_lock_init(&rnp->lock); - lockdep_set_class_and_name(&rnp->lock, + raw_spin_lock_init(&ACCESS_PRIVATE(rnp, lock)); + lockdep_set_class_and_name(&ACCESS_PRIVATE(rnp, lock), &rcu_node_class[i], buf[i]); raw_spin_lock_init(&rnp->fqslock); lockdep_set_class_and_name(&rnp->fqslock, @@ -4455,8 +4488,8 @@ static void __init rcu_init_one(struct rcu_state *rsp, } } - init_waitqueue_head(&rsp->gp_wq); - init_waitqueue_head(&rsp->expedited_wq); + init_swait_queue_head(&rsp->gp_wq); + init_swait_queue_head(&rsp->expedited_wq); rnp = rsp->level[rcu_num_lvls - 1]; for_each_possible_cpu(i) { while (i > rnp->grphi) @@ -4576,8 +4609,8 @@ void __init rcu_init(void) rcu_bootup_announce(); rcu_init_geometry(); - rcu_init_one(&rcu_bh_state, &rcu_bh_data); - rcu_init_one(&rcu_sched_state, &rcu_sched_data); + rcu_init_one(&rcu_bh_state); + rcu_init_one(&rcu_sched_state); if (dump_tree) rcu_dump_rcu_node_tree(&rcu_sched_state); __rcu_init_preempt(); diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index 9fb4e238d4dc..df668c0f9e64 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -27,6 +27,7 @@ #include <linux/threads.h> #include <linux/cpumask.h> #include <linux/seqlock.h> +#include <linux/swait.h> #include <linux/stop_machine.h> /* @@ -149,8 +150,9 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - raw_spinlock_t lock; /* Root rcu_node's lock protects some */ - /* rcu_state fields as well as following. */ + raw_spinlock_t __private lock; /* Root rcu_node's lock protects */ + /* some rcu_state fields as well as */ + /* following. */ unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ @@ -178,6 +180,8 @@ struct rcu_node { /* beginning of each expedited GP. */ unsigned long expmaskinitnext; /* Online CPUs for next expedited GP. */ + /* Any CPU that has ever been online will */ + /* have its bit set. */ unsigned long grpmask; /* Mask to apply to parent qsmask. */ /* Only one bit will be set in this mask. */ int grplo; /* lowest-numbered CPU or group here. */ @@ -241,7 +245,7 @@ struct rcu_node { /* Refused to boost: not sure why, though. */ /* This can happen due to race conditions. */ #ifdef CONFIG_RCU_NOCB_CPU - wait_queue_head_t nocb_gp_wq[2]; + struct swait_queue_head nocb_gp_wq[2]; /* Place for rcu_nocb_kthread() to wait GP. */ #endif /* #ifdef CONFIG_RCU_NOCB_CPU */ int need_future_gp[2]; @@ -384,6 +388,10 @@ struct rcu_data { struct rcu_head oom_head; #endif /* #ifdef CONFIG_RCU_FAST_NO_HZ */ struct mutex exp_funnel_mutex; + atomic_long_t expedited_workdone0; /* # done by others #0. */ + atomic_long_t expedited_workdone1; /* # done by others #1. */ + atomic_long_t expedited_workdone2; /* # done by others #2. */ + atomic_long_t expedited_workdone3; /* # done by others #3. */ /* 7) Callback offloading. */ #ifdef CONFIG_RCU_NOCB_CPU @@ -393,7 +401,7 @@ struct rcu_data { atomic_long_t nocb_q_count_lazy; /* invocation (all stages). */ struct rcu_head *nocb_follower_head; /* CBs ready to invoke. */ struct rcu_head **nocb_follower_tail; - wait_queue_head_t nocb_wq; /* For nocb kthreads to sleep on. */ + struct swait_queue_head nocb_wq; /* For nocb kthreads to sleep on. */ struct task_struct *nocb_kthread; int nocb_defer_wakeup; /* Defer wakeup of nocb_kthread. */ @@ -472,7 +480,7 @@ struct rcu_state { unsigned long gpnum; /* Current gp number. */ unsigned long completed; /* # of last completed gp. */ struct task_struct *gp_kthread; /* Task for grace periods. */ - wait_queue_head_t gp_wq; /* Where GP task waits. */ + struct swait_queue_head gp_wq; /* Where GP task waits. */ short gp_flags; /* Commands for GP task. */ short gp_state; /* GP kthread sleep state. */ @@ -498,13 +506,9 @@ struct rcu_state { /* End of fields guarded by barrier_mutex. */ unsigned long expedited_sequence; /* Take a ticket. */ - atomic_long_t expedited_workdone0; /* # done by others #0. */ - atomic_long_t expedited_workdone1; /* # done by others #1. */ - atomic_long_t expedited_workdone2; /* # done by others #2. */ - atomic_long_t expedited_workdone3; /* # done by others #3. */ atomic_long_t expedited_normal; /* # fallbacks to normal. */ atomic_t expedited_need_qs; /* # CPUs left to check in. */ - wait_queue_head_t expedited_wq; /* Wait for check-ins. */ + struct swait_queue_head expedited_wq; /* Wait for check-ins. */ int ncpus_snap; /* # CPUs seen last time. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ @@ -545,6 +549,18 @@ struct rcu_state { #define RCU_GP_CLEANUP 5 /* Grace-period cleanup started. */ #define RCU_GP_CLEANED 6 /* Grace-period cleanup complete. */ +#ifndef RCU_TREE_NONCORE +static const char * const gp_state_names[] = { + "RCU_GP_IDLE", + "RCU_GP_WAIT_GPS", + "RCU_GP_DONE_GPS", + "RCU_GP_WAIT_FQS", + "RCU_GP_DOING_FQS", + "RCU_GP_CLEANUP", + "RCU_GP_CLEANED", +}; +#endif /* #ifndef RCU_TREE_NONCORE */ + extern struct list_head rcu_struct_flavors; /* Sequence through rcu_state structures for each RCU flavor. */ @@ -607,7 +623,8 @@ static void zero_cpu_stall_ticks(struct rcu_data *rdp); static void increment_cpu_stall_ticks(void); static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu); static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq); -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp); +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp); +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq); static void rcu_init_one_nocb(struct rcu_node *rnp); static bool __call_rcu_nocb(struct rcu_data *rdp, struct rcu_head *rhp, bool lazy, unsigned long flags); @@ -664,3 +681,61 @@ static inline void rcu_nocb_q_lengths(struct rcu_data *rdp, long *ql, long *qll) #else /* #ifdef CONFIG_PPC */ #define smp_mb__after_unlock_lock() do { } while (0) #endif /* #else #ifdef CONFIG_PPC */ + +/* + * Wrappers for the rcu_node::lock acquire and release. + * + * Because the rcu_nodes form a tree, the tree traversal locking will observe + * different lock values, this in turn means that an UNLOCK of one level + * followed by a LOCK of another level does not imply a full memory barrier; + * and most importantly transitivity is lost. + * + * In order to restore full ordering between tree levels, augment the regular + * lock acquire functions with smp_mb__after_unlock_lock(). + * + * As ->lock of struct rcu_node is a __private field, therefore one should use + * these wrappers rather than directly call raw_spin_{lock,unlock}* on ->lock. + */ +static inline void raw_spin_lock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock(&ACCESS_PRIVATE(rnp, lock)); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_unlock_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock(&ACCESS_PRIVATE(rnp, lock)); +} + +static inline void raw_spin_lock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_lock_irq(&ACCESS_PRIVATE(rnp, lock)); + smp_mb__after_unlock_lock(); +} + +static inline void raw_spin_unlock_irq_rcu_node(struct rcu_node *rnp) +{ + raw_spin_unlock_irq(&ACCESS_PRIVATE(rnp, lock)); +} + +#define raw_spin_lock_irqsave_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_lock_irqsave(&ACCESS_PRIVATE(rnp, lock), flags); \ + smp_mb__after_unlock_lock(); \ +} while (0) + +#define raw_spin_unlock_irqrestore_rcu_node(rnp, flags) \ +do { \ + typecheck(unsigned long, flags); \ + raw_spin_unlock_irqrestore(&ACCESS_PRIVATE(rnp, lock), flags); \ +} while (0) + +static inline bool raw_spin_trylock_rcu_node(struct rcu_node *rnp) +{ + bool locked = raw_spin_trylock(&ACCESS_PRIVATE(rnp, lock)); + + if (locked) + smp_mb__after_unlock_lock(); + return locked; +} diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 630c19772630..efdf7b61ce12 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -63,8 +63,7 @@ static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */ /* * Check the RCU kernel configuration parameters and print informative - * messages about anything out of the ordinary. If you like #ifdef, you - * will love this function. + * messages about anything out of the ordinary. */ static void __init rcu_bootup_announce_oddness(void) { @@ -147,8 +146,8 @@ static void __init rcu_bootup_announce(void) * the corresponding expedited grace period will also be the end of the * normal grace period. */ -static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, - unsigned long flags) __releases(rnp->lock) +static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp) + __releases(rnp->lock) /* But leaves rrupts disabled. */ { int blkd_state = (rnp->gp_tasks ? RCU_GP_TASKS : 0) + (rnp->exp_tasks ? RCU_EXP_TASKS : 0) + @@ -236,7 +235,7 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, rnp->gp_tasks = &t->rcu_node_entry; if (!rnp->exp_tasks && (blkd_state & RCU_EXP_BLKD)) rnp->exp_tasks = &t->rcu_node_entry; - raw_spin_unlock(&rnp->lock); + raw_spin_unlock_rcu_node(rnp); /* interrupts remain disabled. */ /* * Report the quiescent state for the expedited GP. This expedited @@ -251,7 +250,6 @@ static void rcu_preempt_ctxt_queue(struct rcu_node *rnp, struct rcu_data *rdp, } else { WARN_ON_ONCE(t->rcu_read_unlock_special.b.exp_need_qs); } - local_irq_restore(flags); } /* @@ -286,12 +284,11 @@ static void rcu_preempt_qs(void) * predating the current grace period drain, in other words, until * rnp->gp_tasks becomes NULL. * - * Caller must disable preemption. + * Caller must disable interrupts. */ static void rcu_preempt_note_context_switch(void) { struct task_struct *t = current; - unsigned long flags; struct rcu_data *rdp; struct rcu_node *rnp; @@ -301,8 +298,7 @@ static void rcu_preempt_note_context_switch(void) /* Possibly blocking in an RCU read-side critical section. */ rdp = this_cpu_ptr(rcu_state_p->rda); rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); t->rcu_read_unlock_special.b.blocked = true; t->rcu_blocked_node = rnp; @@ -318,7 +314,7 @@ static void rcu_preempt_note_context_switch(void) (rnp->qsmask & rdp->grpmask) ? rnp->gpnum : rnp->gpnum + 1); - rcu_preempt_ctxt_queue(rnp, rdp, flags); + rcu_preempt_ctxt_queue(rnp, rdp); } else if (t->rcu_read_lock_nesting < 0 && t->rcu_read_unlock_special.s) { @@ -450,20 +446,13 @@ void rcu_read_unlock_special(struct task_struct *t) /* * Remove this task from the list it blocked on. The task - * now remains queued on the rcu_node corresponding to - * the CPU it first blocked on, so the first attempt to - * acquire the task's rcu_node's ->lock will succeed. - * Keep the loop and add a WARN_ON() out of sheer paranoia. + * now remains queued on the rcu_node corresponding to the + * CPU it first blocked on, so there is no longer any need + * to loop. Retain a WARN_ON_ONCE() out of sheer paranoia. */ - for (;;) { - rnp = t->rcu_blocked_node; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); - if (rnp == t->rcu_blocked_node) - break; - WARN_ON_ONCE(1); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ - } + rnp = t->rcu_blocked_node; + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ + WARN_ON_ONCE(rnp != t->rcu_blocked_node); empty_norm = !rcu_preempt_blocked_readers_cgp(rnp); empty_exp = sync_rcu_preempt_exp_done(rnp); smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */ @@ -500,7 +489,7 @@ void rcu_read_unlock_special(struct task_struct *t) !!rnp->gp_tasks); rcu_report_unblock_qs_rnp(rcu_state_p, rnp, flags); } else { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* Unboost if we were boosted. */ @@ -527,16 +516,16 @@ static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) unsigned long flags; struct task_struct *t; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); if (!rcu_preempt_blocked_readers_cgp(rnp)) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } t = list_entry(rnp->gp_tasks->prev, struct task_struct, rcu_node_entry); list_for_each_entry_continue(t, &rnp->blkd_tasks, rcu_node_entry) sched_show_task(t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } /* @@ -748,6 +737,12 @@ void synchronize_rcu_expedited(void) struct rcu_state *rsp = rcu_state_p; unsigned long s; + /* If expedited grace periods are prohibited, fall back to normal. */ + if (rcu_gp_is_normal()) { + wait_rcu_gp(call_rcu); + return; + } + s = rcu_exp_gp_seq_snap(rsp); rnp_unlock = exp_funnel_lock(rsp, s); @@ -788,7 +783,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier); */ static void __init __rcu_init_preempt(void) { - rcu_init_one(rcu_state_p, rcu_data_p); + rcu_init_one(rcu_state_p); } /* @@ -812,7 +807,6 @@ void exit_rcu(void) #else /* #ifdef CONFIG_PREEMPT_RCU */ static struct rcu_state *const rcu_state_p = &rcu_sched_state; -static struct rcu_data __percpu *const rcu_data_p = &rcu_sched_data; /* * Tell them what RCU they are running. @@ -989,15 +983,14 @@ static int rcu_boost(struct rcu_node *rnp) READ_ONCE(rnp->boost_tasks) == NULL) return 0; /* Nothing left to boost. */ - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); /* * Recheck under the lock: all tasks in need of boosting * might exit their RCU read-side critical sections on their own. */ if (rnp->exp_tasks == NULL && rnp->boost_tasks == NULL) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return 0; } @@ -1034,7 +1027,7 @@ static int rcu_boost(struct rcu_node *rnp) */ t = container_of(tb, struct task_struct, rcu_node_entry); rt_mutex_init_proxy_locked(&rnp->boost_mtx, t); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); /* Lock only for side effect: boosts task t's priority. */ rt_mutex_lock(&rnp->boost_mtx); rt_mutex_unlock(&rnp->boost_mtx); /* Then keep lockdep happy. */ @@ -1094,7 +1087,7 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) if (!rcu_preempt_blocked_readers_cgp(rnp) && rnp->exp_tasks == NULL) { rnp->n_balk_exp_gp_tasks++; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); return; } if (rnp->exp_tasks != NULL || @@ -1104,13 +1097,13 @@ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) ULONG_CMP_GE(jiffies, rnp->boost_time))) { if (rnp->exp_tasks == NULL) rnp->boost_tasks = rnp->gp_tasks; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); t = rnp->boost_kthread_task; if (t) rcu_wake_cond(t, rnp->boost_kthread_status); } else { rcu_initiate_boost_trace(rnp); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } } @@ -1176,10 +1169,9 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp, "rcub/%d", rnp_index); if (IS_ERR(t)) return PTR_ERR(t); - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); rnp->boost_kthread_task = t; - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); sp.sched_priority = kthread_prio; sched_setscheduler_nocheck(t, SCHED_FIFO, &sp); wake_up_process(t); /* get to TASK_INTERRUPTIBLE quickly. */ @@ -1315,7 +1307,7 @@ static void rcu_prepare_kthreads(int cpu) static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) __releases(rnp->lock) { - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } static void invoke_rcu_callbacks_kthread(void) @@ -1524,7 +1516,8 @@ static void rcu_prepare_for_idle(void) struct rcu_state *rsp; int tne; - if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL)) + if (IS_ENABLED(CONFIG_RCU_NOCB_CPU_ALL) || + rcu_is_nocb_cpu(smp_processor_id())) return; /* Handle nohz enablement switches conservatively. */ @@ -1538,10 +1531,6 @@ static void rcu_prepare_for_idle(void) if (!tne) return; - /* If this is a no-CBs CPU, no callbacks, just return. */ - if (rcu_is_nocb_cpu(smp_processor_id())) - return; - /* * If a non-lazy callback arrived at a CPU having only lazy * callbacks, invoke RCU core for the side-effect of recalculating @@ -1567,10 +1556,9 @@ static void rcu_prepare_for_idle(void) if (!*rdp->nxttail[RCU_DONE_TAIL]) continue; rnp = rdp->mynode; - raw_spin_lock(&rnp->lock); /* irqs already disabled. */ - smp_mb__after_unlock_lock(); + raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */ needwake = rcu_accelerate_cbs(rsp, rnp, rdp); - raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */ if (needwake) rcu_gp_kthread_wake(rsp); } @@ -1822,9 +1810,9 @@ early_param("rcu_nocb_poll", parse_rcu_nocb_poll); * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended * grace period. */ -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { - wake_up_all(&rnp->nocb_gp_wq[rnp->completed & 0x1]); + swake_up_all(sq); } /* @@ -1840,10 +1828,15 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) rnp->need_future_gp[(rnp->completed + 1) & 0x1] += nrq; } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return &rnp->nocb_gp_wq[rnp->completed & 0x1]; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { - init_waitqueue_head(&rnp->nocb_gp_wq[0]); - init_waitqueue_head(&rnp->nocb_gp_wq[1]); + init_swait_queue_head(&rnp->nocb_gp_wq[0]); + init_swait_queue_head(&rnp->nocb_gp_wq[1]); } #ifndef CONFIG_RCU_NOCB_CPU_ALL @@ -1868,7 +1861,7 @@ static void wake_nocb_leader(struct rcu_data *rdp, bool force) if (READ_ONCE(rdp_leader->nocb_leader_sleep) || force) { /* Prior smp_mb__after_atomic() orders against prior enqueue. */ WRITE_ONCE(rdp_leader->nocb_leader_sleep, false); - wake_up(&rdp_leader->nocb_wq); + swake_up(&rdp_leader->nocb_wq); } } @@ -2068,10 +2061,9 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) bool needwake; struct rcu_node *rnp = rdp->mynode; - raw_spin_lock_irqsave(&rnp->lock, flags); - smp_mb__after_unlock_lock(); + raw_spin_lock_irqsave_rcu_node(rnp, flags); needwake = rcu_start_future_gp(rnp, rdp, &c); - raw_spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); if (needwake) rcu_gp_kthread_wake(rdp->rsp); @@ -2081,7 +2073,7 @@ static void rcu_nocb_wait_gp(struct rcu_data *rdp) */ trace_rcu_future_gp(rnp, rdp, c, TPS("StartWait")); for (;;) { - wait_event_interruptible( + swait_event_interruptible( rnp->nocb_gp_wq[c & 0x1], (d = ULONG_CMP_GE(READ_ONCE(rnp->completed), c))); if (likely(d)) @@ -2109,7 +2101,7 @@ wait_again: /* Wait for callbacks to appear. */ if (!rcu_nocb_poll) { trace_rcu_nocb_wake(my_rdp->rsp->name, my_rdp->cpu, "Sleep"); - wait_event_interruptible(my_rdp->nocb_wq, + swait_event_interruptible(my_rdp->nocb_wq, !READ_ONCE(my_rdp->nocb_leader_sleep)); /* Memory barrier handled by smp_mb() calls below and repoll. */ } else if (firsttime) { @@ -2184,7 +2176,7 @@ wait_again: * List was empty, wake up the follower. * Memory barriers supplied by atomic_long_add(). */ - wake_up(&rdp->nocb_wq); + swake_up(&rdp->nocb_wq); } } @@ -2205,7 +2197,7 @@ static void nocb_follower_wait(struct rcu_data *rdp) if (!rcu_nocb_poll) { trace_rcu_nocb_wake(rdp->rsp->name, rdp->cpu, "FollowerSleep"); - wait_event_interruptible(rdp->nocb_wq, + swait_event_interruptible(rdp->nocb_wq, READ_ONCE(rdp->nocb_follower_head)); } else if (firsttime) { /* Don't drown trace log with "Poll"! */ @@ -2364,7 +2356,7 @@ void __init rcu_init_nohz(void) static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp) { rdp->nocb_tail = &rdp->nocb_head; - init_waitqueue_head(&rdp->nocb_wq); + init_swait_queue_head(&rdp->nocb_wq); rdp->nocb_follower_tail = &rdp->nocb_follower_head; } @@ -2514,7 +2506,7 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu) return false; } -static void rcu_nocb_gp_cleanup(struct rcu_state *rsp, struct rcu_node *rnp) +static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq) { } @@ -2522,6 +2514,11 @@ static void rcu_nocb_gp_set(struct rcu_node *rnp, int nrq) { } +static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp) +{ + return NULL; +} + static void rcu_init_one_nocb(struct rcu_node *rnp) { } diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c index ef7093cc9b5c..1088e64f01ad 100644 --- a/kernel/rcu/tree_trace.c +++ b/kernel/rcu/tree_trace.c @@ -1,5 +1,5 @@ /* - * Read-Copy Update tracing for classic implementation + * Read-Copy Update tracing for hierarchical implementation. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by @@ -16,6 +16,7 @@ * http://www.gnu.org/licenses/gpl-2.0.html. * * Copyright IBM Corporation, 2008 + * Author: Paul E. McKenney * * Papers: http://www.rdrop.com/users/paulmck/RCU * @@ -33,9 +34,7 @@ #include <linux/sched.h> #include <linux/atomic.h> #include <linux/bitops.h> -#include <linux/module.h> #include <linux/completion.h> -#include <linux/moduleparam.h> #include <linux/percpu.h> #include <linux/notifier.h> #include <linux/cpu.h> @@ -183,14 +182,20 @@ static const struct file_operations rcudata_fops = { static int show_rcuexp(struct seq_file *m, void *v) { + int cpu; struct rcu_state *rsp = (struct rcu_state *)m->private; - + struct rcu_data *rdp; + unsigned long s0 = 0, s1 = 0, s2 = 0, s3 = 0; + + for_each_possible_cpu(cpu) { + rdp = per_cpu_ptr(rsp->rda, cpu); + s0 += atomic_long_read(&rdp->expedited_workdone0); + s1 += atomic_long_read(&rdp->expedited_workdone1); + s2 += atomic_long_read(&rdp->expedited_workdone2); + s3 += atomic_long_read(&rdp->expedited_workdone3); + } seq_printf(m, "s=%lu wd0=%lu wd1=%lu wd2=%lu wd3=%lu n=%lu enq=%d sc=%lu\n", - rsp->expedited_sequence, - atomic_long_read(&rsp->expedited_workdone0), - atomic_long_read(&rsp->expedited_workdone1), - atomic_long_read(&rsp->expedited_workdone2), - atomic_long_read(&rsp->expedited_workdone3), + rsp->expedited_sequence, s0, s1, s2, s3, atomic_long_read(&rsp->expedited_normal), atomic_read(&rsp->expedited_need_qs), rsp->expedited_sequence / 2); @@ -319,7 +324,7 @@ static void show_one_rcugp(struct seq_file *m, struct rcu_state *rsp) unsigned long gpmax; struct rcu_node *rnp = &rsp->node[0]; - raw_spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave_rcu_node(rnp, flags); completed = READ_ONCE(rsp->completed); gpnum = READ_ONCE(rsp->gpnum); if (completed == gpnum) @@ -487,16 +492,4 @@ free_out: debugfs_remove_recursive(rcudir); return 1; } - -static void __exit rcutree_trace_cleanup(void) -{ - debugfs_remove_recursive(rcudir); -} - - -module_init(rcutree_trace_init); -module_exit(rcutree_trace_cleanup); - -MODULE_AUTHOR("Paul E. McKenney"); -MODULE_DESCRIPTION("Read-Copy Update tracing for hierarchical implementation"); -MODULE_LICENSE("GPL"); +device_initcall(rcutree_trace_init); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 5f748c5a40f0..ca828b41c938 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -60,7 +60,12 @@ MODULE_ALIAS("rcupdate"); #endif #define MODULE_PARAM_PREFIX "rcupdate." +#ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); +module_param(rcu_normal, int, 0); +static int rcu_normal_after_boot; +module_param(rcu_normal_after_boot, int, 0); +#endif /* #ifndef CONFIG_TINY_RCU */ #if defined(CONFIG_DEBUG_LOCK_ALLOC) && defined(CONFIG_PREEMPT_COUNT) /** @@ -113,6 +118,18 @@ EXPORT_SYMBOL(rcu_read_lock_sched_held); #ifndef CONFIG_TINY_RCU +/* + * Should expedited grace-period primitives always fall back to their + * non-expedited counterparts? Intended for use within RCU. Note + * that if the user specifies both rcu_expedited and rcu_normal, then + * rcu_normal wins. + */ +bool rcu_gp_is_normal(void) +{ + return READ_ONCE(rcu_normal); +} +EXPORT_SYMBOL_GPL(rcu_gp_is_normal); + static atomic_t rcu_expedited_nesting = ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0); @@ -157,8 +174,6 @@ void rcu_unexpedite_gp(void) } EXPORT_SYMBOL_GPL(rcu_unexpedite_gp); -#endif /* #ifndef CONFIG_TINY_RCU */ - /* * Inform RCU of the end of the in-kernel boot sequence. */ @@ -166,8 +181,12 @@ void rcu_end_inkernel_boot(void) { if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT)) rcu_unexpedite_gp(); + if (rcu_normal_after_boot) + WRITE_ONCE(rcu_normal, 1); } +#endif /* #ifndef CONFIG_TINY_RCU */ + #ifdef CONFIG_PREEMPT_RCU /* diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfacae..074994bcfa9b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/resource.c b/kernel/resource.c index f150dbbe6f62..9b5f04404152 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -105,16 +105,25 @@ static int r_show(struct seq_file *m, void *v) { struct resource *root = m->private; struct resource *r = v, *p; + unsigned long long start, end; int width = root->end < 0x10000 ? 4 : 8; int depth; for (depth = 0, p = r; depth < MAX_IORES_LEVEL; depth++, p = p->parent) if (p->parent == root) break; + + if (file_ns_capable(m->file, &init_user_ns, CAP_SYS_ADMIN)) { + start = r->start; + end = r->end; + } else { + start = end = 0; + } + seq_printf(m, "%*s%0*llx-%0*llx : %s\n", depth * 2, "", - width, (unsigned long long) r->start, - width, (unsigned long long) r->end, + width, start, + width, end, r->name ? r->name : "<BAD>"); return 0; } @@ -233,9 +242,9 @@ static struct resource * __request_resource(struct resource *root, struct resour } } -static int __release_resource(struct resource *old) +static int __release_resource(struct resource *old, bool release_child) { - struct resource *tmp, **p; + struct resource *tmp, **p, *chd; p = &old->parent->child; for (;;) { @@ -243,7 +252,17 @@ static int __release_resource(struct resource *old) if (!tmp) break; if (tmp == old) { - *p = tmp->sibling; + if (release_child || !(tmp->child)) { + *p = tmp->sibling; + } else { + for (chd = tmp->child;; chd = chd->sibling) { + chd->parent = tmp->parent; + if (!(chd->sibling)) + break; + } + *p = tmp->child; + chd->sibling = tmp->sibling; + } old->parent = NULL; return 0; } @@ -325,7 +344,7 @@ int release_resource(struct resource *old) int retval; write_lock(&resource_lock); - retval = __release_resource(old); + retval = __release_resource(old, true); write_unlock(&resource_lock); return retval; } @@ -333,13 +352,13 @@ int release_resource(struct resource *old) EXPORT_SYMBOL(release_resource); /* - * Finds the lowest iomem reosurce exists with-in [res->start.res->end) - * the caller must specify res->start, res->end, res->flags and "name". - * If found, returns 0, res is overwritten, if not found, returns -1. - * This walks through whole tree and not just first level children - * until and unless first_level_children_only is true. + * Finds the lowest iomem resource existing within [res->start.res->end). + * The caller must specify res->start, res->end, res->flags, and optionally + * desc. If found, returns 0, res is overwritten, if not found, returns -1. + * This function walks the whole tree and not just first level children until + * and unless first_level_children_only is true. */ -static int find_next_iomem_res(struct resource *res, char *name, +static int find_next_iomem_res(struct resource *res, unsigned long desc, bool first_level_children_only) { resource_size_t start, end; @@ -358,9 +377,9 @@ static int find_next_iomem_res(struct resource *res, char *name, read_lock(&resource_lock); for (p = iomem_resource.child; p; p = next_resource(p, sibling_only)) { - if (p->flags != res->flags) + if ((p->flags & res->flags) != res->flags) continue; - if (name && strcmp(p->name, name)) + if ((desc != IORES_DESC_NONE) && (desc != p->desc)) continue; if (p->start > end) { p = NULL; @@ -385,15 +404,18 @@ static int find_next_iomem_res(struct resource *res, char *name, * Walks through iomem resources and calls func() with matching resource * ranges. This walks through whole tree and not just first level children. * All the memory ranges which overlap start,end and also match flags and - * name are valid candidates. + * desc are valid candidates. * - * @name: name of resource - * @flags: resource flags + * @desc: I/O resource descriptor. Use IORES_DESC_NONE to skip @desc check. + * @flags: I/O resource flags * @start: start addr * @end: end addr + * + * NOTE: For a new descriptor search, define a new IORES_DESC in + * <linux/ioport.h> and set it in 'desc' of a target resource entry. */ -int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, - void *arg, int (*func)(u64, u64, void *)) +int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, + u64 end, void *arg, int (*func)(u64, u64, void *)) { struct resource res; u64 orig_end; @@ -403,23 +425,27 @@ int walk_iomem_res(char *name, unsigned long flags, u64 start, u64 end, res.end = end; res.flags = flags; orig_end = res.end; + while ((res.start < res.end) && - (!find_next_iomem_res(&res, name, false))) { + (!find_next_iomem_res(&res, desc, false))) { + ret = (*func)(res.start, res.end, arg); if (ret) break; + res.start = res.end + 1; res.end = orig_end; } + return ret; } /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". This function deals with - * full ranges and not pfn. If resources are not pfn aligned, dealing - * with pfn can truncate ranges. + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * Now, this function is only for System RAM, it deals with full ranges and + * not PFNs. If resources are not PFN-aligned, dealing with PFNs can truncate + * ranges. */ int walk_system_ram_res(u64 start, u64 end, void *arg, int (*func)(u64, u64, void *)) @@ -430,10 +456,10 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, res.start = start; res.end = end; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (!find_next_iomem_res(&res, "System RAM", true))) { + (!find_next_iomem_res(&res, IORES_DESC_NONE, true))) { ret = (*func)(res.start, res.end, arg); if (ret) break; @@ -446,9 +472,9 @@ int walk_system_ram_res(u64 start, u64 end, void *arg, #if !defined(CONFIG_ARCH_HAS_WALK_MEMORY) /* - * This function calls callback against all memory range of "System RAM" - * which are marked as IORESOURCE_MEM and IORESOUCE_BUSY. - * Now, this function is only for "System RAM". + * This function calls the @func callback against all memory ranges of type + * System RAM which are marked as IORESOURCE_SYSTEM_RAM and IORESOUCE_BUSY. + * It is to be used only for System RAM. */ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, void *arg, int (*func)(unsigned long, unsigned long, void *)) @@ -460,10 +486,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, res.start = (u64) start_pfn << PAGE_SHIFT; res.end = ((u64)(start_pfn + nr_pages) << PAGE_SHIFT) - 1; - res.flags = IORESOURCE_MEM | IORESOURCE_BUSY; + res.flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY; orig_end = res.end; while ((res.start < res.end) && - (find_next_iomem_res(&res, "System RAM", true) >= 0)) { + (find_next_iomem_res(&res, IORES_DESC_NONE, true) >= 0)) { pfn = (res.start + PAGE_SIZE - 1) >> PAGE_SHIFT; end_pfn = (res.end + 1) >> PAGE_SHIFT; if (end_pfn > pfn) @@ -484,7 +510,7 @@ static int __is_ram(unsigned long pfn, unsigned long nr_pages, void *arg) } /* * This generic page_is_ram() returns true if specified address is - * registered as "System RAM" in iomem_resource list. + * registered as System RAM in iomem_resource list. */ int __weak page_is_ram(unsigned long pfn) { @@ -496,30 +522,34 @@ EXPORT_SYMBOL_GPL(page_is_ram); * region_intersects() - determine intersection of region with known resources * @start: region start address * @size: size of region - * @name: name of resource (in iomem_resource) + * @flags: flags of resource (in iomem_resource) + * @desc: descriptor of resource (in iomem_resource) or IORES_DESC_NONE * * Check if the specified region partially overlaps or fully eclipses a - * resource identified by @name. Return REGION_DISJOINT if the region - * does not overlap @name, return REGION_MIXED if the region overlaps - * @type and another resource, and return REGION_INTERSECTS if the - * region overlaps @type and no other defined resource. Note, that - * REGION_INTERSECTS is also returned in the case when the specified - * region overlaps RAM and undefined memory holes. + * resource identified by @flags and @desc (optional with IORES_DESC_NONE). + * Return REGION_DISJOINT if the region does not overlap @flags/@desc, + * return REGION_MIXED if the region overlaps @flags/@desc and another + * resource, and return REGION_INTERSECTS if the region overlaps @flags/@desc + * and no other defined resource. Note that REGION_INTERSECTS is also + * returned in the case when the specified region overlaps RAM and undefined + * memory holes. * * region_intersect() is used by memory remapping functions to ensure * the user is not remapping RAM and is a vast speed up over walking * through the resource table page by page. */ -int region_intersects(resource_size_t start, size_t size, const char *name) +int region_intersects(resource_size_t start, size_t size, unsigned long flags, + unsigned long desc) { - unsigned long flags = IORESOURCE_MEM | IORESOURCE_BUSY; resource_size_t end = start + size - 1; int type = 0; int other = 0; struct resource *p; read_lock(&resource_lock); for (p = iomem_resource.child; p ; p = p->sibling) { - bool is_type = strcmp(p->name, name) == 0 && p->flags == flags; + bool is_type = (((p->flags & flags) == flags) && + ((desc == IORES_DESC_NONE) || + (desc == p->desc))); if (start >= p->start && start <= p->end) is_type ? type++ : other++; @@ -538,6 +568,7 @@ int region_intersects(resource_size_t start, size_t size, const char *name) return REGION_DISJOINT; } +EXPORT_SYMBOL_GPL(region_intersects); void __weak arch_remove_reservations(struct resource *avail) { @@ -667,7 +698,7 @@ static int reallocate_resource(struct resource *root, struct resource *old, old->start = new.start; old->end = new.end; } else { - __release_resource(old); + __release_resource(old, true); *old = new; conflict = __request_resource(root, old); BUG_ON(conflict); @@ -813,6 +844,9 @@ static struct resource * __insert_resource(struct resource *parent, struct resou * entirely fit within the range of the new resource, then the new * resource is inserted and the conflicting resources become children of * the new resource. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ struct resource *insert_resource_conflict(struct resource *parent, struct resource *new) { @@ -830,6 +864,9 @@ struct resource *insert_resource_conflict(struct resource *parent, struct resour * @new: new resource to insert * * Returns 0 on success, -EBUSY if the resource can't be inserted. + * + * This function is intended for producers of resources, such as FW modules + * and bus drivers. */ int insert_resource(struct resource *parent, struct resource *new) { @@ -838,6 +875,7 @@ int insert_resource(struct resource *parent, struct resource *new) conflict = insert_resource_conflict(parent, new); return conflict ? -EBUSY : 0; } +EXPORT_SYMBOL_GPL(insert_resource); /** * insert_resource_expand_to_fit - Insert a resource into the resource tree @@ -873,6 +911,32 @@ void insert_resource_expand_to_fit(struct resource *root, struct resource *new) write_unlock(&resource_lock); } +/** + * remove_resource - Remove a resource in the resource tree + * @old: resource to remove + * + * Returns 0 on success, -EINVAL if the resource is not valid. + * + * This function removes a resource previously inserted by insert_resource() + * or insert_resource_conflict(), and moves the children (if any) up to + * where they were before. insert_resource() and insert_resource_conflict() + * insert a new resource, and move any conflicting resources down to the + * children of the new resource. + * + * insert_resource(), insert_resource_conflict() and remove_resource() are + * intended for producers of resources, such as FW modules and bus drivers. + */ +int remove_resource(struct resource *old) +{ + int retval; + + write_lock(&resource_lock); + retval = __release_resource(old, false); + write_unlock(&resource_lock); + return retval; +} +EXPORT_SYMBOL_GPL(remove_resource); + static int __adjust_resource(struct resource *res, resource_size_t start, resource_size_t size) { @@ -948,6 +1012,7 @@ static void __init __reserve_region_with_split(struct resource *root, res->start = start; res->end = end; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; while (1) { @@ -982,6 +1047,7 @@ static void __init __reserve_region_with_split(struct resource *root, next_res->start = conflict->end + 1; next_res->end = end; next_res->flags = IORESOURCE_BUSY; + next_res->desc = IORES_DESC_NONE; } } else { res->start = conflict->end + 1; @@ -1071,21 +1137,24 @@ struct resource * __request_region(struct resource *parent, res->name = name; res->start = start; res->end = start + n - 1; - res->flags = resource_type(parent); - res->flags |= IORESOURCE_BUSY | flags; write_lock(&resource_lock); for (;;) { struct resource *conflict; + res->flags = resource_type(parent) | resource_ext_type(parent); + res->flags |= IORESOURCE_BUSY | flags; + res->desc = parent->desc; + conflict = __request_resource(parent, res); if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); @@ -1237,6 +1306,7 @@ int release_mem_region_adjustable(struct resource *parent, new_res->start = end + 1; new_res->end = res->end; new_res->flags = res->flags; + new_res->desc = res->desc; new_res->parent = res->parent; new_res->sibling = res->sibling; new_res->child = NULL; @@ -1412,6 +1482,7 @@ static int __init reserve_setup(char *str) res->start = io_start; res->end = io_start + io_num - 1; res->flags = IORESOURCE_BUSY; + res->desc = IORES_DESC_NONE; res->child = NULL; if (request_resource(res->start >= 0x10000 ? &iomem_resource : &ioport_resource, res) == 0) reserved = x+1; @@ -1498,8 +1569,15 @@ int iomem_is_exclusive(u64 addr) break; if (p->end < addr) continue; - if (p->flags & IORESOURCE_BUSY && - p->flags & IORESOURCE_EXCLUSIVE) { + /* + * A resource is exclusive if IORESOURCE_EXCLUSIVE is set + * or CONFIG_IO_STRICT_DEVMEM is enabled and the + * resource is busy. + */ + if ((p->flags & IORESOURCE_BUSY) == 0) + continue; + if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) + || p->flags & IORESOURCE_EXCLUSIVE) { err = 1; break; } diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 67687973ce80..414d9c16da42 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -2,6 +2,10 @@ ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_clock.o = $(CC_FLAGS_FTRACE) endif +# These files are disabled because they produce non-interesting flaky coverage +# that is not a function of syscall inputs. E.g. involuntary context switches. +KCOV_INSTRUMENT := n + ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is # needed for x86 only. Why this used to be enabled for all architectures is beyond @@ -13,9 +17,10 @@ endif obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o stop_task.o -obj-y += wait.o completion.o idle.o +obj-y += wait.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o obj-$(CONFIG_SCHED_AUTOGROUP) += auto_group.o obj-$(CONFIG_SCHEDSTATS) += stats.o obj-$(CONFIG_SCHED_DEBUG) += debug.o obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o +obj-$(CONFIG_CPU_FREQ) += cpufreq.o diff --git a/kernel/sched/auto_group.c b/kernel/sched/auto_group.c index 750ed601ddf7..a5d966cb8891 100644 --- a/kernel/sched/auto_group.c +++ b/kernel/sched/auto_group.c @@ -212,7 +212,7 @@ int proc_sched_autogroup_set_nice(struct task_struct *p, int nice) ag = autogroup_task_get(p); down_write(&ag->lock); - err = sched_group_set_shares(ag->tg, prio_to_weight[nice + 20]); + err = sched_group_set_shares(ag->tg, sched_prio_to_weight[nice + 20]); if (!err) ag->nice = nice; up_write(&ag->lock); diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c0a205101c23..fedb967a9841 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -1,7 +1,7 @@ /* * sched_clock for unstable cpu clocks * - * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra * * Updates and enhancements: * Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com> @@ -61,6 +61,7 @@ #include <linux/static_key.h> #include <linux/workqueue.h> #include <linux/compiler.h> +#include <linux/tick.h> /* * Scheduler clock - returns current time in nanosec units. @@ -89,6 +90,8 @@ static void __set_sched_clock_stable(void) { if (!sched_clock_stable()) static_key_slow_inc(&__sched_clock_stable); + + tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE); } void set_sched_clock_stable(void) @@ -108,6 +111,8 @@ static void __clear_sched_clock_stable(struct work_struct *work) /* XXX worry about clock continuity */ if (sched_clock_stable()) static_key_slow_dec(&__sched_clock_stable); + + tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE); } static DECLARE_WORK(sched_clock_work, __clear_sched_clock_stable); @@ -354,7 +359,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) return; sched_clock_tick(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4d568ac9319e..8b489fcac37b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -26,6 +26,7 @@ * Thomas Gleixner, Mike Kravetz */ +#include <linux/kasan.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/nmi.h> @@ -66,14 +67,13 @@ #include <linux/pagemap.h> #include <linux/hrtimer.h> #include <linux/tick.h> -#include <linux/debugfs.h> #include <linux/ctype.h> #include <linux/ftrace.h> #include <linux/slab.h> #include <linux/init_task.h> -#include <linux/binfmts.h> #include <linux/context_tracking.h> #include <linux/compiler.h> +#include <linux/frame.h> #include <asm/switch_to.h> #include <asm/tlb.h> @@ -124,138 +124,6 @@ const_debug unsigned int sysctl_sched_features = #undef SCHED_FEAT -#ifdef CONFIG_SCHED_DEBUG -#define SCHED_FEAT(name, enabled) \ - #name , - -static const char * const sched_feat_names[] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static int sched_feat_show(struct seq_file *m, void *v) -{ - int i; - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (!(sysctl_sched_features & (1UL << i))) - seq_puts(m, "NO_"); - seq_printf(m, "%s ", sched_feat_names[i]); - } - seq_puts(m, "\n"); - - return 0; -} - -#ifdef HAVE_JUMP_LABEL - -#define jump_label_key__true STATIC_KEY_INIT_TRUE -#define jump_label_key__false STATIC_KEY_INIT_FALSE - -#define SCHED_FEAT(name, enabled) \ - jump_label_key__##enabled , - -struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { -#include "features.h" -}; - -#undef SCHED_FEAT - -static void sched_feat_disable(int i) -{ - static_key_disable(&sched_feat_keys[i]); -} - -static void sched_feat_enable(int i) -{ - static_key_enable(&sched_feat_keys[i]); -} -#else -static void sched_feat_disable(int i) { }; -static void sched_feat_enable(int i) { }; -#endif /* HAVE_JUMP_LABEL */ - -static int sched_feat_set(char *cmp) -{ - int i; - int neg = 0; - - if (strncmp(cmp, "NO_", 3) == 0) { - neg = 1; - cmp += 3; - } - - for (i = 0; i < __SCHED_FEAT_NR; i++) { - if (strcmp(cmp, sched_feat_names[i]) == 0) { - if (neg) { - sysctl_sched_features &= ~(1UL << i); - sched_feat_disable(i); - } else { - sysctl_sched_features |= (1UL << i); - sched_feat_enable(i); - } - break; - } - } - - return i; -} - -static ssize_t -sched_feat_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - char *cmp; - int i; - struct inode *inode; - - if (cnt > 63) - cnt = 63; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - cmp = strstrip(buf); - - /* Ensure the static_key remains in a consistent state */ - inode = file_inode(filp); - mutex_lock(&inode->i_mutex); - i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); - if (i == __SCHED_FEAT_NR) - return -EINVAL; - - *ppos += cnt; - - return cnt; -} - -static int sched_feat_open(struct inode *inode, struct file *filp) -{ - return single_open(filp, sched_feat_show, NULL); -} - -static const struct file_operations sched_feat_fops = { - .open = sched_feat_open, - .write = sched_feat_write, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -static __init int sched_init_debug(void) -{ - debugfs_create_file("sched_features", 0644, NULL, NULL, - &sched_feat_fops); - - return 0; -} -late_initcall(sched_init_debug); -#endif /* CONFIG_SCHED_DEBUG */ - /* * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. @@ -456,15 +324,19 @@ static inline void init_hrtick(void) /* * cmpxchg based fetch_or, macro so it works for different integer types */ -#define fetch_or(ptr, val) \ -({ typeof(*(ptr)) __old, __val = *(ptr); \ - for (;;) { \ - __old = cmpxchg((ptr), __val, __val | (val)); \ - if (__old == __val) \ - break; \ - __val = __old; \ - } \ - __old; \ +#define fetch_or(ptr, mask) \ + ({ \ + typeof(ptr) _ptr = (ptr); \ + typeof(mask) _mask = (mask); \ + typeof(*_ptr) _old, _val = *_ptr; \ + \ + for (;;) { \ + _old = cmpxchg(_ptr, _val, _val | _mask); \ + if (_old == _val) \ + break; \ + _val = _old; \ + } \ + _old; \ }) #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) @@ -715,31 +587,36 @@ static inline bool got_nohz_idle_kick(void) #endif /* CONFIG_NO_HZ_COMMON */ #ifdef CONFIG_NO_HZ_FULL -bool sched_can_stop_tick(void) +bool sched_can_stop_tick(struct rq *rq) { + int fifo_nr_running; + + /* Deadline tasks, even if single, need the tick */ + if (rq->dl.dl_nr_running) + return false; + /* - * FIFO realtime policy runs the highest priority task. Other runnable - * tasks are of a lower priority. The scheduler tick does nothing. + * FIFO realtime policy runs the highest priority task (after DEADLINE). + * Other runnable tasks are of a lower priority. The scheduler tick + * isn't needed. */ - if (current->policy == SCHED_FIFO) + fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running; + if (fifo_nr_running) return true; /* * Round-robin realtime tasks time slice with other tasks at the same - * realtime priority. Is this task the only one at this priority? + * realtime priority. */ - if (current->policy == SCHED_RR) { - struct sched_rt_entity *rt_se = ¤t->rt; - - return rt_se->run_list.prev == rt_se->run_list.next; + if (rq->rt.rr_nr_running) { + if (rq->rt.rr_nr_running == 1) + return true; + else + return false; } - /* - * More than one running task need preemption. - * nr_running update is assumed to be visible - * after IPI is sent from wakers. - */ - if (this_rq()->nr_running > 1) + /* Normal multitasking need periodic preemption checks */ + if (rq->cfs.nr_running > 1) return false; return true; @@ -823,8 +700,8 @@ static void set_load_weight(struct task_struct *p) return; } - load->weight = scale_load(prio_to_weight[prio]); - load->inv_weight = prio_to_wmult[prio]; + load->weight = scale_load(sched_prio_to_weight[prio]); + load->inv_weight = sched_prio_to_wmult[prio]; } static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags) @@ -1071,8 +948,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new { lockdep_assert_held(&rq->lock); - dequeue_task(rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + dequeue_task(rq, p, 0); set_task_cpu(p, new_cpu); raw_spin_unlock(&rq->lock); @@ -1080,8 +957,8 @@ static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new raw_spin_lock(&rq->lock); BUG_ON(task_cpu(p) != new_cpu); - p->on_rq = TASK_ON_RQ_QUEUED; enqueue_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); return rq; @@ -1274,6 +1151,15 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING && !p->on_rq); + /* + * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING, + * because schedstat_wait_{start,end} rebase migrating task's wait_start + * time relying on p->on_rq. + */ + WARN_ON_ONCE(p->state == TASK_RUNNING && + p->sched_class == &fair_sched_class && + (p->on_rq && !task_on_rq_migrating(p))); + #ifdef CONFIG_LOCKDEP /* * The caller should hold either p->pi_lock or rq->lock, when changing @@ -1310,9 +1196,11 @@ static void __migrate_swap_task(struct task_struct *p, int cpu) src_rq = task_rq(p); dst_rq = cpu_rq(cpu); + p->on_rq = TASK_ON_RQ_MIGRATING; deactivate_task(src_rq, p, 0); set_task_cpu(p, cpu); activate_task(dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(dst_rq, p, 0); } else { /* @@ -1905,6 +1793,97 @@ static void ttwu_queue(struct task_struct *p, int cpu) raw_spin_unlock(&rq->lock); } +/* + * Notes on Program-Order guarantees on SMP systems. + * + * MIGRATION + * + * The basic program-order guarantee on SMP systems is that when a task [t] + * migrates, all its activity on its old cpu [c0] happens-before any subsequent + * execution on its new cpu [c1]. + * + * For migration (of runnable tasks) this is provided by the following means: + * + * A) UNLOCK of the rq(c0)->lock scheduling out task t + * B) migration for t is required to synchronize *both* rq(c0)->lock and + * rq(c1)->lock (if not at the same time, then in that order). + * C) LOCK of the rq(c1)->lock scheduling in task + * + * Transitivity guarantees that B happens after A and C after B. + * Note: we only require RCpc transitivity. + * Note: the cpu doing B need not be c0 or c1 + * + * Example: + * + * CPU0 CPU1 CPU2 + * + * LOCK rq(0)->lock + * sched-out X + * sched-in Y + * UNLOCK rq(0)->lock + * + * LOCK rq(0)->lock // orders against CPU0 + * dequeue X + * UNLOCK rq(0)->lock + * + * LOCK rq(1)->lock + * enqueue X + * UNLOCK rq(1)->lock + * + * LOCK rq(1)->lock // orders against CPU2 + * sched-out Z + * sched-in X + * UNLOCK rq(1)->lock + * + * + * BLOCKING -- aka. SLEEP + WAKEUP + * + * For blocking we (obviously) need to provide the same guarantee as for + * migration. However the means are completely different as there is no lock + * chain to provide order. Instead we do: + * + * 1) smp_store_release(X->on_cpu, 0) + * 2) smp_cond_acquire(!X->on_cpu) + * + * Example: + * + * CPU0 (schedule) CPU1 (try_to_wake_up) CPU2 (schedule) + * + * LOCK rq(0)->lock LOCK X->pi_lock + * dequeue X + * sched-out X + * smp_store_release(X->on_cpu, 0); + * + * smp_cond_acquire(!X->on_cpu); + * X->state = WAKING + * set_task_cpu(X,2) + * + * LOCK rq(2)->lock + * enqueue X + * X->state = RUNNING + * UNLOCK rq(2)->lock + * + * LOCK rq(2)->lock // orders against CPU1 + * sched-out Z + * sched-in X + * UNLOCK rq(2)->lock + * + * UNLOCK X->pi_lock + * UNLOCK rq(0)->lock + * + * + * However; for wakeups there is a second guarantee we must provide, namely we + * must observe the state that lead to our wakeup. That is, not only must our + * task observe its own prior state, it must also observe the stores prior to + * its wakeup. + * + * This means that any means of doing remote wakeups must order the CPU doing + * the wakeup against the CPU the task is going to end up running on. This, + * however, is already required for the regular Program-Order guarantee above, + * since the waking CPU is the one issueing the ACQUIRE (smp_cond_acquire). + * + */ + /** * try_to_wake_up - wake up a thread * @p: the thread to be awakened @@ -1947,15 +1926,34 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) #ifdef CONFIG_SMP /* - * If the owning (remote) cpu is still in the middle of schedule() with - * this task as prev, wait until its done referencing the task. + * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be + * possible to, falsely, observe p->on_cpu == 0. + * + * One must be running (->on_cpu == 1) in order to remove oneself + * from the runqueue. + * + * [S] ->on_cpu = 1; [L] ->on_rq + * UNLOCK rq->lock + * RMB + * LOCK rq->lock + * [S] ->on_rq = 0; [L] ->on_cpu + * + * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock + * from the consecutive calls to schedule(); the first switching to our + * task, the second putting it to sleep. */ - while (p->on_cpu) - cpu_relax(); + smp_rmb(); + /* - * Pairs with the smp_wmb() in finish_lock_switch(). + * If the owning (remote) cpu is still in the middle of schedule() with + * this task as prev, wait until its done referencing the task. + * + * Pairs with the smp_store_release() in finish_lock_switch(). + * + * This ensures that tasks getting woken will be fully ordered against + * their previous state and preserve Program Order. */ - smp_rmb(); + smp_cond_acquire(!p->on_cpu); p->sched_contributes_to_load = !!task_contributes_to_load(p); p->state = TASK_WAKING; @@ -1972,7 +1970,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ttwu_queue(p, cpu); stat: - ttwu_stat(p, cpu, wake_flags); + if (schedstat_enabled()) + ttwu_stat(p, cpu, wake_flags); out: raw_spin_unlock_irqrestore(&p->pi_lock, flags); @@ -2020,7 +2019,8 @@ static void try_to_wake_up_local(struct task_struct *p) ttwu_activate(rq, p, ENQUEUE_WAKEUP); ttwu_do_wakeup(rq, p, 0); - ttwu_stat(p, smp_processor_id(), 0); + if (schedstat_enabled()) + ttwu_stat(p, smp_processor_id(), 0); out: raw_spin_unlock(&p->pi_lock); } @@ -2039,7 +2039,6 @@ out: */ int wake_up_process(struct task_struct *p) { - WARN_ON(task_is_stopped_or_traced(p)); return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); @@ -2063,7 +2062,6 @@ void __dl_clear_params(struct task_struct *p) dl_se->dl_bw = 0; dl_se->dl_throttled = 0; - dl_se->dl_new = 1; dl_se->dl_yielded = 0; } @@ -2085,7 +2083,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) p->se.vruntime = 0; INIT_LIST_HEAD(&p->se.group_node); +#ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; +#endif + #ifdef CONFIG_SCHEDSTATS + /* Even if schedstat is disabled, there should not be garbage */ memset(&p->se.statistics, 0, sizeof(p->se.statistics)); #endif @@ -2094,6 +2097,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) __dl_clear_params(p); INIT_LIST_HEAD(&p->rt.run_list); + p->rt.timeout = 0; + p->rt.time_slice = sched_rr_timeslice; + p->rt.on_rq = 0; + p->rt.on_list = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -2157,6 +2164,69 @@ int sysctl_numa_balancing(struct ctl_table *table, int write, #endif #endif +DEFINE_STATIC_KEY_FALSE(sched_schedstats); + +#ifdef CONFIG_SCHEDSTATS +static void set_schedstats(bool enabled) +{ + if (enabled) + static_branch_enable(&sched_schedstats); + else + static_branch_disable(&sched_schedstats); +} + +void force_schedstat_enabled(void) +{ + if (!schedstat_enabled()) { + pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); + static_branch_enable(&sched_schedstats); + } +} + +static int __init setup_schedstats(char *str) +{ + int ret = 0; + if (!str) + goto out; + + if (!strcmp(str, "enable")) { + set_schedstats(true); + ret = 1; + } else if (!strcmp(str, "disable")) { + set_schedstats(false); + ret = 1; + } +out: + if (!ret) + pr_warn("Unable to parse schedstats=\n"); + + return ret; +} +__setup("schedstats=", setup_schedstats); + +#ifdef CONFIG_PROC_SYSCTL +int sysctl_schedstats(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + struct ctl_table t; + int err; + int state = static_branch_likely(&sched_schedstats); + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + t = *table; + t.data = &state; + err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); + if (err < 0) + return err; + if (write) + set_schedstats(state); + return err; +} +#endif +#endif + /* * fork()/clone()-time setup: */ @@ -2638,7 +2708,7 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) /* * context_switch - switch to the new MM and the new thread's register state. */ -static inline struct rq * +static __always_inline struct rq * context_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { @@ -2886,16 +2956,6 @@ u64 scheduler_tick_max_deferment(void) } #endif -notrace unsigned long get_parent_ip(unsigned long addr) -{ - if (in_lock_functions(addr)) { - addr = CALLER_ADDR2; - if (in_lock_functions(addr)) - addr = CALLER_ADDR3; - } - return addr; -} - #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ defined(CONFIG_PREEMPT_TRACER)) @@ -2917,7 +2977,7 @@ void preempt_count_add(int val) PREEMPT_MASK - 10); #endif if (preempt_count() == val) { - unsigned long ip = get_parent_ip(CALLER_ADDR1); + unsigned long ip = get_lock_parent_ip(); #ifdef CONFIG_DEBUG_PREEMPT current->preempt_disable_ip = ip; #endif @@ -2944,7 +3004,7 @@ void preempt_count_sub(int val) #endif if (preempt_count() == val) - trace_preempt_on(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); __preempt_count_sub(val); } EXPORT_SYMBOL(preempt_count_sub); @@ -3085,7 +3145,6 @@ static void __sched notrace __schedule(bool preempt) cpu = smp_processor_id(); rq = cpu_rq(cpu); - rcu_note_context_switch(); prev = rq->curr; /* @@ -3104,13 +3163,16 @@ static void __sched notrace __schedule(bool preempt) if (sched_feat(HRTICK)) hrtick_clear(rq); + local_irq_disable(); + rcu_note_context_switch(); + /* * Make sure that signal_pending_state()->signal_pending() below * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) * done by the caller to avoid the race with signal_wake_up(). */ smp_mb__before_spinlock(); - raw_spin_lock_irq(&rq->lock); + raw_spin_lock(&rq->lock); lockdep_pin_lock(&rq->lock); rq->clock_skip_update <<= 1; /* promote REQ to ACT */ @@ -3131,7 +3193,7 @@ static void __sched notrace __schedule(bool preempt) if (prev->flags & PF_WQ_WORKER) { struct task_struct *to_wakeup; - to_wakeup = wq_worker_sleeping(prev, cpu); + to_wakeup = wq_worker_sleeping(prev); if (to_wakeup) try_to_wake_up_local(to_wakeup); } @@ -3154,7 +3216,6 @@ static void __sched notrace __schedule(bool preempt) trace_sched_switch(preempt, prev, next); rq = context_switch(rq, prev, next); /* unlocks the rq */ - cpu = cpu_of(rq); } else { lockdep_unpin_lock(&rq->lock); raw_spin_unlock_irq(&rq->lock); @@ -3162,6 +3223,7 @@ static void __sched notrace __schedule(bool preempt) balance_callback(rq); } +STACK_FRAME_NON_STANDARD(__schedule); /* switch_to() */ static inline void sched_submit_work(struct task_struct *tsk) { @@ -3340,7 +3402,7 @@ EXPORT_SYMBOL(default_wake_function); */ void rt_mutex_setprio(struct task_struct *p, int prio) { - int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE; + int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE; struct rq *rq; const struct sched_class *prev_class; @@ -3368,11 +3430,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio) trace_sched_pi_setprio(p, prio); oldprio = p->prio; + + if (oldprio == prio) + queue_flag &= ~DEQUEUE_MOVE; + prev_class = p->sched_class; queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flag); if (running) put_prev_task(rq, p); @@ -3390,7 +3456,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (!dl_prio(p->normal_prio) || (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) { p->dl.dl_boosted = 1; - enqueue_flag |= ENQUEUE_REPLENISH; + queue_flag |= ENQUEUE_REPLENISH; } else p->dl.dl_boosted = 0; p->sched_class = &dl_sched_class; @@ -3398,7 +3464,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (dl_prio(oldprio)) p->dl.dl_boosted = 0; if (oldprio < prio) - enqueue_flag |= ENQUEUE_HEAD; + queue_flag |= ENQUEUE_HEAD; p->sched_class = &rt_sched_class; } else { if (dl_prio(oldprio)) @@ -3413,7 +3479,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) if (running) p->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, p, enqueue_flag); + enqueue_task(rq, p, queue_flag); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -3769,6 +3835,7 @@ static int __sched_setscheduler(struct task_struct *p, const struct sched_class *prev_class; struct rq *rq; int reset_on_fork; + int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE; /* may grab non-irq protected spin_locks */ BUG_ON(in_interrupt()); @@ -3951,17 +4018,14 @@ change: * itself. */ new_effective_prio = rt_mutex_get_effective_prio(p, newprio); - if (new_effective_prio == oldprio) { - __setscheduler_params(p, attr); - task_rq_unlock(rq, p, &flags); - return 0; - } + if (new_effective_prio == oldprio) + queue_flags &= ~DEQUEUE_MOVE; } queued = task_on_rq_queued(p); running = task_current(rq, p); if (queued) - dequeue_task(rq, p, DEQUEUE_SAVE); + dequeue_task(rq, p, queue_flags); if (running) put_prev_task(rq, p); @@ -3971,15 +4035,14 @@ change: if (running) p->sched_class->set_curr_task(rq); if (queued) { - int enqueue_flags = ENQUEUE_RESTORE; /* * We enqueue to tail when the priority of a task is * increased (user space view). */ - if (oldprio <= p->prio) - enqueue_flags |= ENQUEUE_HEAD; + if (oldprio < p->prio) + queue_flags |= ENQUEUE_HEAD; - enqueue_task(rq, p, enqueue_flags); + enqueue_task(rq, p, queue_flags); } check_class_changed(rq, p, prev_class, oldprio); @@ -4970,6 +5033,8 @@ void init_idle(struct task_struct *idle, int cpu) idle->state = TASK_RUNNING; idle->se.exec_start = sched_clock(); + kasan_unpoison_task_stack(idle); + #ifdef CONFIG_SMP /* * Its possible that init_idle() gets called multiple times on a task, @@ -5279,183 +5344,6 @@ static void migrate_tasks(struct rq *dead_rq) } #endif /* CONFIG_HOTPLUG_CPU */ -#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) - -static struct ctl_table sd_ctl_dir[] = { - { - .procname = "sched_domain", - .mode = 0555, - }, - {} -}; - -static struct ctl_table sd_ctl_root[] = { - { - .procname = "kernel", - .mode = 0555, - .child = sd_ctl_dir, - }, - {} -}; - -static struct ctl_table *sd_alloc_ctl_entry(int n) -{ - struct ctl_table *entry = - kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); - - return entry; -} - -static void sd_free_ctl_entry(struct ctl_table **tablep) -{ - struct ctl_table *entry; - - /* - * In the intermediate directories, both the child directory and - * procname are dynamically allocated and could fail but the mode - * will always be set. In the lowest directory the names are - * static strings and all have proc handlers. - */ - for (entry = *tablep; entry->mode; entry++) { - if (entry->child) - sd_free_ctl_entry(&entry->child); - if (entry->proc_handler == NULL) - kfree(entry->procname); - } - - kfree(*tablep); - *tablep = NULL; -} - -static int min_load_idx = 0; -static int max_load_idx = CPU_LOAD_IDX_MAX-1; - -static void -set_table_entry(struct ctl_table *entry, - const char *procname, void *data, int maxlen, - umode_t mode, proc_handler *proc_handler, - bool load_idx) -{ - entry->procname = procname; - entry->data = data; - entry->maxlen = maxlen; - entry->mode = mode; - entry->proc_handler = proc_handler; - - if (load_idx) { - entry->extra1 = &min_load_idx; - entry->extra2 = &max_load_idx; - } -} - -static struct ctl_table * -sd_alloc_ctl_domain_table(struct sched_domain *sd) -{ - struct ctl_table *table = sd_alloc_ctl_entry(14); - - if (table == NULL) - return NULL; - - set_table_entry(&table[0], "min_interval", &sd->min_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[1], "max_interval", &sd->max_interval, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[2], "busy_idx", &sd->busy_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[3], "idle_idx", &sd->idle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[5], "wake_idx", &sd->wake_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, - sizeof(int), 0644, proc_dointvec_minmax, true); - set_table_entry(&table[7], "busy_factor", &sd->busy_factor, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[9], "cache_nice_tries", - &sd->cache_nice_tries, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[10], "flags", &sd->flags, - sizeof(int), 0644, proc_dointvec_minmax, false); - set_table_entry(&table[11], "max_newidle_lb_cost", - &sd->max_newidle_lb_cost, - sizeof(long), 0644, proc_doulongvec_minmax, false); - set_table_entry(&table[12], "name", sd->name, - CORENAME_MAX_SIZE, 0444, proc_dostring, false); - /* &table[13] is terminator */ - - return table; -} - -static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) -{ - struct ctl_table *entry, *table; - struct sched_domain *sd; - int domain_num = 0, i; - char buf[32]; - - for_each_domain(cpu, sd) - domain_num++; - entry = table = sd_alloc_ctl_entry(domain_num + 1); - if (table == NULL) - return NULL; - - i = 0; - for_each_domain(cpu, sd) { - snprintf(buf, 32, "domain%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_domain_table(sd); - entry++; - i++; - } - return table; -} - -static struct ctl_table_header *sd_sysctl_header; -static void register_sched_domain_sysctl(void) -{ - int i, cpu_num = num_possible_cpus(); - struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); - char buf[32]; - - WARN_ON(sd_ctl_dir[0].child); - sd_ctl_dir[0].child = entry; - - if (entry == NULL) - return; - - for_each_possible_cpu(i) { - snprintf(buf, 32, "cpu%d", i); - entry->procname = kstrdup(buf, GFP_KERNEL); - entry->mode = 0555; - entry->child = sd_alloc_ctl_cpu_table(i); - entry++; - } - - WARN_ON(sd_sysctl_header); - sd_sysctl_header = register_sysctl_table(sd_ctl_root); -} - -/* may be called multiple times per register */ -static void unregister_sched_domain_sysctl(void) -{ - unregister_sysctl_table(sd_sysctl_header); - sd_sysctl_header = NULL; - if (sd_ctl_dir[0].child) - sd_free_ctl_entry(&sd_ctl_dir[0].child); -} -#else -static void register_sched_domain_sysctl(void) -{ -} -static void unregister_sched_domain_sysctl(void) -{ -} -#endif /* CONFIG_SCHED_DEBUG && CONFIG_SYSCTL */ - static void set_rq_online(struct rq *rq) { if (!rq->online) { @@ -5501,6 +5389,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_UP_PREPARE: rq->calc_load_update = calc_load_update; + account_reset_rq(rq); break; case CPU_ONLINE: @@ -5566,16 +5455,6 @@ static int sched_cpu_active(struct notifier_block *nfb, set_cpu_rq_start_time(); return NOTIFY_OK; - case CPU_ONLINE: - /* - * At this point a starting CPU has marked itself as online via - * set_cpu_online(). But it might not yet have marked itself - * as active, which is essential from here on. - */ - set_cpu_active(cpu, true); - stop_machine_unpark(cpu); - return NOTIFY_OK; - case CPU_DOWN_FAILED: set_cpu_active(cpu, true); return NOTIFY_OK; @@ -5847,13 +5726,13 @@ static int init_rootdomain(struct root_domain *rd) { memset(rd, 0, sizeof(*rd)); - if (!alloc_cpumask_var(&rd->span, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL)) goto out; - if (!alloc_cpumask_var(&rd->online, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL)) goto free_span; - if (!alloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL)) goto free_online; - if (!alloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) + if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL)) goto free_dlo_mask; init_dl_bw(&rd->dl_bw); @@ -6047,11 +5926,16 @@ cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) { + int ret; + alloc_bootmem_cpumask_var(&cpu_isolated_map); - cpulist_parse(str, cpu_isolated_map); + ret = cpulist_parse(str, cpu_isolated_map); + if (ret) { + pr_err("sched: Error, all isolcpus= values must be between 0 and %d\n", nr_cpu_ids); + return 0; + } return 1; } - __setup("isolcpus=", isolated_cpu_setup); struct s_data { @@ -6714,7 +6598,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; @@ -7331,6 +7215,9 @@ int in_sched_functions(unsigned long addr) */ struct task_group root_task_group; LIST_HEAD(task_groups); + +/* Cacheline aligned slab cache for task_group */ +static struct kmem_cache *task_group_cache __read_mostly; #endif DECLARE_PER_CPU(cpumask_var_t, load_balance_mask); @@ -7388,11 +7275,12 @@ void __init sched_init(void) #endif /* CONFIG_RT_GROUP_SCHED */ #ifdef CONFIG_CGROUP_SCHED + task_group_cache = KMEM_CACHE(task_group, 0); + list_add(&root_task_group.list, &task_groups); INIT_LIST_HEAD(&root_task_group.children); INIT_LIST_HEAD(&root_task_group.siblings); autogroup_init(&init_task); - #endif /* CONFIG_CGROUP_SCHED */ for_each_possible_cpu(i) { @@ -7668,12 +7556,12 @@ void set_curr_task(int cpu, struct task_struct *p) /* task_group_lock serializes the addition/removal of task groups */ static DEFINE_SPINLOCK(task_group_lock); -static void free_sched_group(struct task_group *tg) +static void sched_free_group(struct task_group *tg) { free_fair_sched_group(tg); free_rt_sched_group(tg); autogroup_free(tg); - kfree(tg); + kmem_cache_free(task_group_cache, tg); } /* allocate runqueue etc for a new task group */ @@ -7681,7 +7569,7 @@ struct task_group *sched_create_group(struct task_group *parent) { struct task_group *tg; - tg = kzalloc(sizeof(*tg), GFP_KERNEL); + tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); if (!tg) return ERR_PTR(-ENOMEM); @@ -7694,7 +7582,7 @@ struct task_group *sched_create_group(struct task_group *parent) return tg; err: - free_sched_group(tg); + sched_free_group(tg); return ERR_PTR(-ENOMEM); } @@ -7714,27 +7602,24 @@ void sched_online_group(struct task_group *tg, struct task_group *parent) } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group_rcu(struct rcu_head *rhp) +static void sched_free_group_rcu(struct rcu_head *rhp) { /* now it should be safe to free those cfs_rqs */ - free_sched_group(container_of(rhp, struct task_group, rcu)); + sched_free_group(container_of(rhp, struct task_group, rcu)); } -/* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group_rcu); + call_rcu(&tg->rcu, sched_free_group_rcu); } void sched_offline_group(struct task_group *tg) { unsigned long flags; - int i; /* end participation in shares distribution */ - for_each_possible_cpu(i) - unregister_fair_sched_group(tg, i); + unregister_fair_sched_group(tg); spin_lock_irqsave(&task_group_lock, flags); list_del_rcu(&tg->list); @@ -7760,7 +7645,7 @@ void sched_move_task(struct task_struct *tsk) queued = task_on_rq_queued(tsk); if (queued) - dequeue_task(rq, tsk, DEQUEUE_SAVE); + dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE); if (unlikely(running)) put_prev_task(rq, tsk); @@ -7784,7 +7669,7 @@ void sched_move_task(struct task_struct *tsk) if (unlikely(running)) tsk->sched_class->set_curr_task(rq); if (queued) - enqueue_task(rq, tsk, ENQUEUE_RESTORE); + enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE); task_rq_unlock(rq, tsk, &flags); } @@ -8185,44 +8070,39 @@ cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) if (IS_ERR(tg)) return ERR_PTR(-ENOMEM); + sched_online_group(tg, parent); + return &tg->css; } -static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) +static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); - if (parent) - sched_online_group(tg, parent); - return 0; + sched_offline_group(tg); } static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) { struct task_group *tg = css_tg(css); - sched_destroy_group(tg); -} - -static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -{ - struct task_group *tg = css_tg(css); - - sched_offline_group(tg); + /* + * Relies on the RCU grace period between css_released() and this. + */ + sched_free_group(tg); } -static void cpu_cgroup_fork(struct task_struct *task, void *private) +static void cpu_cgroup_fork(struct task_struct *task) { sched_move_task(task); } -static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) { + cgroup_taskset_for_each(task, css, tset) { #ifdef CONFIG_RT_GROUP_SCHED if (!sched_rt_can_attach(css_tg(css), task)) return -EINVAL; @@ -8235,12 +8115,12 @@ static int cpu_cgroup_can_attach(struct cgroup_subsys_state *css, return 0; } -static void cpu_cgroup_attach(struct cgroup_subsys_state *css, - struct cgroup_taskset *tset) +static void cpu_cgroup_attach(struct cgroup_taskset *tset) { struct task_struct *task; + struct cgroup_subsys_state *css; - cgroup_taskset_for_each(task, tset) + cgroup_taskset_for_each(task, css, tset) sched_move_task(task); } @@ -8569,14 +8449,13 @@ static struct cftype cpu_files[] = { struct cgroup_subsys cpu_cgrp_subsys = { .css_alloc = cpu_cgroup_css_alloc, + .css_released = cpu_cgroup_css_released, .css_free = cpu_cgroup_css_free, - .css_online = cpu_cgroup_css_online, - .css_offline = cpu_cgroup_css_offline, .fork = cpu_cgroup_fork, .can_attach = cpu_cgroup_can_attach, .attach = cpu_cgroup_attach, .legacy_cftypes = cpu_files, - .early_init = 1, + .early_init = true, }; #endif /* CONFIG_CGROUP_SCHED */ @@ -8586,3 +8465,44 @@ void dump_cpu_task(int cpu) pr_info("Task dump for CPU %d:\n", cpu); sched_show_task(cpu_curr(cpu)); } + +/* + * Nice levels are multiplicative, with a gentle 10% change for every + * nice level changed. I.e. when a CPU-bound task goes from nice 0 to + * nice 1, it will get ~10% less CPU time than another CPU-bound task + * that remained on nice 0. + * + * The "10% effect" is relative and cumulative: from _any_ nice level, + * if you go up 1 level, it's -10% CPU usage, if you go down 1 level + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) + */ +const int sched_prio_to_weight[40] = { + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, +}; + +/* + * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated. + * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ +const u32 sched_prio_to_wmult[40] = { + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, +}; diff --git a/kernel/sched/cpuacct.c b/kernel/sched/cpuacct.c index dd7cbb55bbf2..4a811203c04a 100644 --- a/kernel/sched/cpuacct.c +++ b/kernel/sched/cpuacct.c @@ -145,13 +145,16 @@ static u64 cpuusage_read(struct cgroup_subsys_state *css, struct cftype *cft) } static int cpuusage_write(struct cgroup_subsys_state *css, struct cftype *cft, - u64 reset) + u64 val) { struct cpuacct *ca = css_ca(css); int err = 0; int i; - if (reset) { + /* + * Only allow '0' here to do a reset. + */ + if (val) { err = -EINVAL; goto out; } @@ -235,23 +238,10 @@ static struct cftype files[] = { void cpuacct_charge(struct task_struct *tsk, u64 cputime) { struct cpuacct *ca; - int cpu; - - cpu = task_cpu(tsk); rcu_read_lock(); - - ca = task_ca(tsk); - - while (true) { - u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu); - *cpuusage += cputime; - - ca = parent_ca(ca); - if (!ca) - break; - } - + for (ca = task_ca(tsk); ca; ca = parent_ca(ca)) + *this_cpu_ptr(ca->cpuusage) += cputime; rcu_read_unlock(); } @@ -260,18 +250,13 @@ void cpuacct_charge(struct task_struct *tsk, u64 cputime) * * Note: it's the caller that updates the account of the root cgroup. */ -void cpuacct_account_field(struct task_struct *p, int index, u64 val) +void cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { - struct kernel_cpustat *kcpustat; struct cpuacct *ca; rcu_read_lock(); - ca = task_ca(p); - while (ca != &root_cpuacct) { - kcpustat = this_cpu_ptr(ca->cpustat); - kcpustat->cpustat[index] += val; - ca = parent_ca(ca); - } + for (ca = task_ca(tsk); ca != &root_cpuacct; ca = parent_ca(ca)) + this_cpu_ptr(ca->cpustat)->cpustat[index] += val; rcu_read_unlock(); } @@ -279,5 +264,5 @@ struct cgroup_subsys cpuacct_cgrp_subsys = { .css_alloc = cpuacct_css_alloc, .css_free = cpuacct_css_free, .legacy_cftypes = files, - .early_init = 1, + .early_init = true, }; diff --git a/kernel/sched/cpuacct.h b/kernel/sched/cpuacct.h index ed605624a5e7..ba72807c73d4 100644 --- a/kernel/sched/cpuacct.h +++ b/kernel/sched/cpuacct.h @@ -1,7 +1,7 @@ #ifdef CONFIG_CGROUP_CPUACCT extern void cpuacct_charge(struct task_struct *tsk, u64 cputime); -extern void cpuacct_account_field(struct task_struct *p, int index, u64 val); +extern void cpuacct_account_field(struct task_struct *tsk, int index, u64 val); #else @@ -10,7 +10,7 @@ static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) } static inline void -cpuacct_account_field(struct task_struct *p, int index, u64 val) +cpuacct_account_field(struct task_struct *tsk, int index, u64 val) { } diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c new file mode 100644 index 000000000000..928c4ba32f68 --- /dev/null +++ b/kernel/sched/cpufreq.c @@ -0,0 +1,37 @@ +/* + * Scheduler code and data structures related to cpufreq. + * + * Copyright (C) 2016, Intel Corporation + * Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include "sched.h" + +DEFINE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_set_update_util_data - Populate the CPU's update_util_data pointer. + * @cpu: The CPU to set the pointer for. + * @data: New pointer value. + * + * Set and publish the update_util_data pointer for the given CPU. That pointer + * points to a struct update_util_data object containing a callback function + * to call from cpufreq_update_util(). That function will be called from an RCU + * read-side critical section, so it must not sleep. + * + * Callers must use RCU-sched callbacks to free any memory that might be + * accessed via the old update_util_data pointer or invoke synchronize_sched() + * right after this function to avoid use-after-free. + */ +void cpufreq_set_update_util_data(int cpu, struct update_util_data *data) +{ + if (WARN_ON(data && !data->func)) + return; + + rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), data); +} +EXPORT_SYMBOL_GPL(cpufreq_set_update_util_data); diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 26a54461bf59..75f98c5498d5 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -5,6 +5,9 @@ #include <linux/static_key.h> #include <linux/context_tracking.h> #include "sched.h" +#ifdef CONFIG_PARAVIRT +#include <asm/paravirt.h> +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -259,21 +262,21 @@ static __always_inline bool steal_account_process_tick(void) #ifdef CONFIG_PARAVIRT if (static_key_false(¶virt_steal_enabled)) { u64 steal; - cputime_t steal_ct; + unsigned long steal_jiffies; steal = paravirt_steal_clock(smp_processor_id()); steal -= this_rq()->prev_steal_time; /* - * cputime_t may be less precise than nsecs (eg: if it's - * based on jiffies). Lets cast the result to cputime + * steal is in nsecs but our caller is expecting steal + * time in jiffies. Lets cast the result to jiffies * granularity and account the rest on the next rounds. */ - steal_ct = nsecs_to_cputime(steal); - this_rq()->prev_steal_time += cputime_to_nsecs(steal_ct); + steal_jiffies = nsecs_to_jiffies(steal); + this_rq()->prev_steal_time += jiffies_to_nsecs(steal_jiffies); - account_steal_time(steal_ct); - return steal_ct; + account_steal_time(jiffies_to_cputime(steal_jiffies)); + return steal_jiffies; } #endif return false; @@ -466,7 +469,7 @@ void account_process_tick(struct task_struct *p, int user_tick) cputime_t one_jiffy_scaled = cputime_to_scaled(cputime_one_jiffy); struct rq *rq = this_rq(); - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; if (sched_clock_irqtime) { @@ -665,26 +668,25 @@ void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime #endif /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN -static unsigned long long vtime_delta(struct task_struct *tsk) +static cputime_t vtime_delta(struct task_struct *tsk) { - unsigned long long clock; + unsigned long now = READ_ONCE(jiffies); - clock = local_clock(); - if (clock < tsk->vtime_snap) + if (time_before(now, (unsigned long)tsk->vtime_snap)) return 0; - return clock - tsk->vtime_snap; + return jiffies_to_cputime(now - tsk->vtime_snap); } static cputime_t get_vtime_delta(struct task_struct *tsk) { - unsigned long long delta = vtime_delta(tsk); + unsigned long now = READ_ONCE(jiffies); + unsigned long delta = now - tsk->vtime_snap; - WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_SLEEPING); - tsk->vtime_snap += delta; + WARN_ON_ONCE(tsk->vtime_snap_whence == VTIME_INACTIVE); + tsk->vtime_snap = now; - /* CHECKME: always safe to convert nsecs to cputime? */ - return nsecs_to_cputime(delta); + return jiffies_to_cputime(delta); } static void __vtime_account_system(struct task_struct *tsk) @@ -696,37 +698,44 @@ static void __vtime_account_system(struct task_struct *tsk) void vtime_account_system(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + if (!vtime_delta(tsk)) + return; + + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_gen_account_irq_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); if (context_tracking_in_user()) tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_account_user(struct task_struct *tsk) { cputime_t delta_cpu; - write_seqlock(&tsk->vtime_seqlock); - delta_cpu = get_vtime_delta(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); tsk->vtime_snap_whence = VTIME_SYS; - account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); - write_sequnlock(&tsk->vtime_seqlock); + if (vtime_delta(tsk)) { + delta_cpu = get_vtime_delta(tsk); + account_user_time(tsk, delta_cpu, cputime_to_scaled(delta_cpu)); + } + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_user_enter(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); tsk->vtime_snap_whence = VTIME_USER; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } void vtime_guest_enter(struct task_struct *tsk) @@ -738,19 +747,20 @@ void vtime_guest_enter(struct task_struct *tsk) * synchronization against the reader (task_gtime()) * that can thus safely catch up with a tickless delta. */ - write_seqlock(&tsk->vtime_seqlock); - __vtime_account_system(tsk); + write_seqcount_begin(&tsk->vtime_seqcount); + if (vtime_delta(tsk)) + __vtime_account_system(tsk); current->flags |= PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_enter); void vtime_guest_exit(struct task_struct *tsk) { - write_seqlock(&tsk->vtime_seqlock); + write_seqcount_begin(&tsk->vtime_seqcount); __vtime_account_system(tsk); current->flags &= ~PF_VCPU; - write_sequnlock(&tsk->vtime_seqlock); + write_seqcount_end(&tsk->vtime_seqcount); } EXPORT_SYMBOL_GPL(vtime_guest_exit); @@ -763,24 +773,26 @@ void vtime_account_idle(struct task_struct *tsk) void arch_vtime_task_switch(struct task_struct *prev) { - write_seqlock(&prev->vtime_seqlock); - prev->vtime_snap_whence = VTIME_SLEEPING; - write_sequnlock(&prev->vtime_seqlock); + write_seqcount_begin(&prev->vtime_seqcount); + prev->vtime_snap_whence = VTIME_INACTIVE; + write_seqcount_end(&prev->vtime_seqcount); - write_seqlock(¤t->vtime_seqlock); + write_seqcount_begin(¤t->vtime_seqcount); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock_cpu(smp_processor_id()); - write_sequnlock(¤t->vtime_seqlock); + current->vtime_snap = jiffies; + write_seqcount_end(¤t->vtime_seqcount); } void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; - write_seqlock_irqsave(&t->vtime_seqlock, flags); + local_irq_save(flags); + write_seqcount_begin(&t->vtime_seqcount); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock_cpu(cpu); - write_sequnlock_irqrestore(&t->vtime_seqlock, flags); + t->vtime_snap = jiffies; + write_seqcount_end(&t->vtime_seqcount); + local_irq_restore(flags); } cputime_t task_gtime(struct task_struct *t) @@ -788,14 +800,17 @@ cputime_t task_gtime(struct task_struct *t) unsigned int seq; cputime_t gtime; + if (!vtime_accounting_enabled()) + return t->gtime; + do { - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); gtime = t->gtime; - if (t->flags & PF_VCPU) + if (t->vtime_snap_whence == VTIME_SYS && t->flags & PF_VCPU) gtime += vtime_delta(t); - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); return gtime; } @@ -818,7 +833,7 @@ fetch_task_cputime(struct task_struct *t, *udelta = 0; *sdelta = 0; - seq = read_seqbegin(&t->vtime_seqlock); + seq = read_seqcount_begin(&t->vtime_seqcount); if (u_dst) *u_dst = *u_src; @@ -826,7 +841,7 @@ fetch_task_cputime(struct task_struct *t, *s_dst = *s_src; /* Task is sleeping, nothing to add */ - if (t->vtime_snap_whence == VTIME_SLEEPING || + if (t->vtime_snap_whence == VTIME_INACTIVE || is_idle_task(t)) continue; @@ -842,7 +857,7 @@ fetch_task_cputime(struct task_struct *t, if (t->vtime_snap_whence == VTIME_SYS) *sdelta = delta; } - } while (read_seqretry(&t->vtime_seqlock, seq)); + } while (read_seqcount_retry(&t->vtime_seqcount, seq)); } @@ -850,6 +865,14 @@ void task_cputime(struct task_struct *t, cputime_t *utime, cputime_t *stime) { cputime_t udelta, sdelta; + if (!vtime_accounting_enabled()) { + if (utime) + *utime = t->utime; + if (stime) + *stime = t->stime; + return; + } + fetch_task_cputime(t, utime, stime, &t->utime, &t->stime, &udelta, &sdelta); if (utime) @@ -863,6 +886,14 @@ void task_cputime_scaled(struct task_struct *t, { cputime_t udelta, sdelta; + if (!vtime_accounting_enabled()) { + if (utimescaled) + *utimescaled = t->utimescaled; + if (stimescaled) + *stimescaled = t->stimescaled; + return; + } + fetch_task_cputime(t, utimescaled, stimescaled, &t->utimescaled, &t->stimescaled, &udelta, &sdelta); if (utimescaled) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8b0a15e285f9..affd97ec9f65 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -176,8 +176,10 @@ static void enqueue_pushable_dl_task(struct rq *rq, struct task_struct *p) } } - if (leftmost) + if (leftmost) { dl_rq->pushable_dl_tasks_leftmost = &p->pushable_dl_tasks; + dl_rq->earliest_dl.next = p->dl.deadline; + } rb_link_node(&p->pushable_dl_tasks, parent, link); rb_insert_color(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -195,6 +197,10 @@ static void dequeue_pushable_dl_task(struct rq *rq, struct task_struct *p) next_node = rb_next(&p->pushable_dl_tasks); dl_rq->pushable_dl_tasks_leftmost = next_node; + if (next_node) { + dl_rq->earliest_dl.next = rb_entry(next_node, + struct task_struct, pushable_dl_tasks)->dl.deadline; + } } rb_erase(&p->pushable_dl_tasks, &dl_rq->pushable_dl_tasks_root); @@ -346,7 +352,15 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - WARN_ON(!dl_se->dl_new || dl_se->dl_throttled); + WARN_ON(dl_time_before(rq_clock(rq), dl_se->deadline)); + + /* + * We are racing with the deadline timer. So, do nothing because + * the deadline timer handler will take care of properly recharging + * the runtime and postponing the deadline + */ + if (dl_se->dl_throttled) + return; /* * We use the regular wall clock time to set deadlines in the @@ -355,7 +369,6 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se, */ dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; - dl_se->dl_new = 0; } /* @@ -393,6 +406,9 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, dl_se->runtime = pi_se->dl_runtime; } + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; + /* * We keep moving the deadline away until we get some * available runtime for the entity. This ensures correct @@ -414,7 +430,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } @@ -494,15 +510,6 @@ static void update_dl_entity(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq = dl_rq_of_se(dl_se); struct rq *rq = rq_of_dl_rq(dl_rq); - /* - * The arrival of a new instance needs special treatment, i.e., - * the actual scheduling parameters have to be "renewed". - */ - if (dl_se->dl_new) { - setup_new_dl_entity(dl_se, pi_se); - return; - } - if (dl_time_before(dl_se->deadline, rq_clock(rq)) || dl_entity_overflow(dl_se, pi_se, rq_clock(rq))) { dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; @@ -599,16 +606,6 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) } /* - * This is possible if switched_from_dl() raced against a running - * callback that took the above !dl_task() path and we've since then - * switched back into SCHED_DEADLINE. - * - * There's nothing to do except drop our task reference. - */ - if (dl_se->dl_new) - goto unlock; - - /* * The task might have been boosted by someone else and might be in the * boosting/deboosting path, its not throttled. */ @@ -720,6 +717,10 @@ static void update_curr_dl(struct rq *rq) if (!dl_task(curr) || !on_dl_rq(dl_se)) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + /* * Consumed budget is computed considering the time as * observed by schedulable tasks (excluding time spent @@ -729,8 +730,11 @@ static void update_curr_dl(struct rq *rq) * approach need further study. */ delta_exec = rq_clock_task(rq) - curr->se.exec_start; - if (unlikely((s64)delta_exec <= 0)) + if (unlikely((s64)delta_exec <= 0)) { + if (unlikely(dl_se->dl_yielded)) + goto throttle; return; + } schedstat_set(curr->se.statistics.exec_max, max(curr->se.statistics.exec_max, delta_exec)); @@ -743,8 +747,10 @@ static void update_curr_dl(struct rq *rq) sched_rt_avg_update(rq, delta_exec); - dl_se->runtime -= dl_se->dl_yielded ? 0 : delta_exec; - if (dl_runtime_exceeded(dl_se)) { + dl_se->runtime -= delta_exec; + +throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { dl_se->dl_throttled = 1; __dequeue_task_dl(rq, curr, 0); if (unlikely(dl_se->dl_boosted || !start_dl_timer(curr))) @@ -782,42 +788,14 @@ static void update_curr_dl(struct rq *rq) #ifdef CONFIG_SMP -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu); - -static inline u64 next_deadline(struct rq *rq) -{ - struct task_struct *next = pick_next_earliest_dl_task(rq, rq->cpu); - - if (next && dl_prio(next->prio)) - return next->dl.deadline; - else - return 0; -} - static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) { struct rq *rq = rq_of_dl_rq(dl_rq); if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { - /* - * If the dl_rq had no -deadline tasks, or if the new task - * has shorter deadline than the current one on dl_rq, we - * know that the previous earliest becomes our next earliest, - * as the new task becomes the earliest itself. - */ - dl_rq->earliest_dl.next = dl_rq->earliest_dl.curr; dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline, 1); - } else if (dl_rq->earliest_dl.next == 0 || - dl_time_before(deadline, dl_rq->earliest_dl.next)) { - /* - * On the other hand, if the new -deadline task has a - * a later deadline than the earliest one on dl_rq, but - * it is earlier than the next (if any), we must - * recompute the next-earliest. - */ - dl_rq->earliest_dl.next = next_deadline(rq); } } @@ -839,7 +817,6 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) entry = rb_entry(leftmost, struct sched_dl_entity, rb_node); dl_rq->earliest_dl.curr = entry->deadline; - dl_rq->earliest_dl.next = next_deadline(rq); cpudl_set(&rq->rd->cpudl, rq->cpu, entry->deadline, 1); } } @@ -940,7 +917,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, * parameters of the task might need updating. Otherwise, * we want a replenishment of its runtime. */ - if (dl_se->dl_new || flags & ENQUEUE_WAKEUP) + if (flags & ENQUEUE_WAKEUP) update_dl_entity(dl_se, pi_se); else if (flags & ENQUEUE_REPLENISH) replenish_dl_entity(dl_se, pi_se); @@ -1017,18 +994,14 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) */ static void yield_task_dl(struct rq *rq) { - struct task_struct *p = rq->curr; - /* * We make the task go to sleep until its current deadline by * forcing its runtime to zero. This way, update_curr_dl() stops * it and the bandwidth timer will wake it up and will give it * new scheduling parameters (thanks to dl_yielded=1). */ - if (p->dl.runtime > 0) { - rq->curr->dl.dl_yielded = 1; - p->dl.runtime = 0; - } + rq->curr->dl.dl_yielded = 1; + update_rq_clock(rq); update_curr_dl(rq); /* @@ -1274,28 +1247,6 @@ static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) return 0; } -/* Returns the second earliest -deadline task, NULL otherwise */ -static struct task_struct *pick_next_earliest_dl_task(struct rq *rq, int cpu) -{ - struct rb_node *next_node = rq->dl.rb_leftmost; - struct sched_dl_entity *dl_se; - struct task_struct *p = NULL; - -next_node: - next_node = rb_next(next_node); - if (next_node) { - dl_se = rb_entry(next_node, struct sched_dl_entity, rb_node); - p = dl_task_of(dl_se); - - if (pick_dl_task(rq, p, cpu)) - return p; - - goto next_node; - } - - return NULL; -} - /* * Return the earliest pushable rq's task, which is suitable to be executed * on the CPU, NULL otherwise: @@ -1767,6 +1718,9 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p) */ static void switched_to_dl(struct rq *rq, struct task_struct *p) { + if (dl_time_before(p->dl.deadline, rq_clock(rq))) + setup_new_dl_entity(&p->dl, &p->dl); + if (task_on_rq_queued(p) && rq->curr != p) { #ifdef CONFIG_SMP if (p->nr_cpus_allowed > 1 && rq->dl.overloaded) @@ -1813,8 +1767,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, */ resched_curr(rq); #endif /* CONFIG_SMP */ - } else - switched_to_dl(rq, p); + } } const struct sched_class dl_sched_class = { diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c index 641511771ae6..4fbc3bd5ff60 100644 --- a/kernel/sched/debug.c +++ b/kernel/sched/debug.c @@ -16,6 +16,7 @@ #include <linux/kallsyms.h> #include <linux/utsname.h> #include <linux/mempolicy.h> +#include <linux/debugfs.h> #include "sched.h" @@ -58,6 +59,309 @@ static unsigned long nsec_low(unsigned long long nsec) #define SPLIT_NS(x) nsec_high(x), nsec_low(x) +#define SCHED_FEAT(name, enabled) \ + #name , + +static const char * const sched_feat_names[] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static int sched_feat_show(struct seq_file *m, void *v) +{ + int i; + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (!(sysctl_sched_features & (1UL << i))) + seq_puts(m, "NO_"); + seq_printf(m, "%s ", sched_feat_names[i]); + } + seq_puts(m, "\n"); + + return 0; +} + +#ifdef HAVE_JUMP_LABEL + +#define jump_label_key__true STATIC_KEY_INIT_TRUE +#define jump_label_key__false STATIC_KEY_INIT_FALSE + +#define SCHED_FEAT(name, enabled) \ + jump_label_key__##enabled , + +struct static_key sched_feat_keys[__SCHED_FEAT_NR] = { +#include "features.h" +}; + +#undef SCHED_FEAT + +static void sched_feat_disable(int i) +{ + static_key_disable(&sched_feat_keys[i]); +} + +static void sched_feat_enable(int i) +{ + static_key_enable(&sched_feat_keys[i]); +} +#else +static void sched_feat_disable(int i) { }; +static void sched_feat_enable(int i) { }; +#endif /* HAVE_JUMP_LABEL */ + +static int sched_feat_set(char *cmp) +{ + int i; + int neg = 0; + + if (strncmp(cmp, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; i < __SCHED_FEAT_NR; i++) { + if (strcmp(cmp, sched_feat_names[i]) == 0) { + if (neg) { + sysctl_sched_features &= ~(1UL << i); + sched_feat_disable(i); + } else { + sysctl_sched_features |= (1UL << i); + sched_feat_enable(i); + } + break; + } + } + + return i; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp; + int i; + struct inode *inode; + + if (cnt > 63) + cnt = 63; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + cmp = strstrip(buf); + + /* Ensure the static_key remains in a consistent state */ + inode = file_inode(filp); + inode_lock(inode); + i = sched_feat_set(cmp); + inode_unlock(inode); + if (i == __SCHED_FEAT_NR) + return -EINVAL; + + *ppos += cnt; + + return cnt; +} + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, sched_feat_show, NULL); +} + +static const struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .write = sched_feat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#ifdef CONFIG_SMP + +#ifdef CONFIG_SYSCTL + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {} +}; + +static struct ctl_table sd_ctl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {} +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); + + return entry; +} + +static void sd_free_ctl_entry(struct ctl_table **tablep) +{ + struct ctl_table *entry; + + /* + * In the intermediate directories, both the child directory and + * procname are dynamically allocated and could fail but the mode + * will always be set. In the lowest directory the names are + * static strings and all have proc handlers. + */ + for (entry = *tablep; entry->mode; entry++) { + if (entry->child) + sd_free_ctl_entry(&entry->child); + if (entry->proc_handler == NULL) + kfree(entry->procname); + } + + kfree(*tablep); + *tablep = NULL; +} + +static int min_load_idx = 0; +static int max_load_idx = CPU_LOAD_IDX_MAX-1; + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + umode_t mode, proc_handler *proc_handler, + bool load_idx) +{ + entry->procname = procname; + entry->data = data; + entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; + + if (load_idx) { + entry->extra1 = &min_load_idx; + entry->extra2 = &max_load_idx; + } +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + if (table == NULL) + return NULL; + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax, true); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[9], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[10], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax, false); + set_table_entry(&table[11], "max_newidle_lb_cost", + &sd->max_newidle_lb_cost, + sizeof(long), 0644, proc_doulongvec_minmax, false); + set_table_entry(&table[12], "name", sd->name, + CORENAME_MAX_SIZE, 0444, proc_dostring, false); + /* &table[13] is terminator */ + + return table; +} + +static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + if (table == NULL) + return NULL; + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +void register_sched_domain_sysctl(void) +{ + int i, cpu_num = num_possible_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + WARN_ON(sd_ctl_dir[0].child); + sd_ctl_dir[0].child = entry; + + if (entry == NULL) + return; + + for_each_possible_cpu(i) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + entry++; + } + + WARN_ON(sd_sysctl_header); + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} + +/* may be called multiple times per register */ +void unregister_sched_domain_sysctl(void) +{ + unregister_sysctl_table(sd_sysctl_header); + sd_sysctl_header = NULL; + if (sd_ctl_dir[0].child) + sd_free_ctl_entry(&sd_ctl_dir[0].child); +} +#endif /* CONFIG_SYSCTL */ +#endif /* CONFIG_SMP */ + #ifdef CONFIG_FAIR_GROUP_SCHED static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group *tg) { @@ -75,16 +379,18 @@ static void print_cfs_group_stats(struct seq_file *m, int cpu, struct task_group PN(se->vruntime); PN(se->sum_exec_runtime); #ifdef CONFIG_SCHEDSTATS - PN(se->statistics.wait_start); - PN(se->statistics.sleep_start); - PN(se->statistics.block_start); - PN(se->statistics.sleep_max); - PN(se->statistics.block_max); - PN(se->statistics.exec_max); - PN(se->statistics.slice_max); - PN(se->statistics.wait_max); - PN(se->statistics.wait_sum); - P(se->statistics.wait_count); + if (schedstat_enabled()) { + PN(se->statistics.wait_start); + PN(se->statistics.sleep_start); + PN(se->statistics.block_start); + PN(se->statistics.sleep_max); + PN(se->statistics.block_max); + PN(se->statistics.exec_max); + PN(se->statistics.slice_max); + PN(se->statistics.wait_max); + PN(se->statistics.wait_sum); + P(se->statistics.wait_count); + } #endif P(se->load.weight); #ifdef CONFIG_SMP @@ -122,10 +428,12 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) (long long)(p->nvcsw + p->nivcsw), p->prio); #ifdef CONFIG_SCHEDSTATS - SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", - SPLIT_NS(p->se.statistics.wait_sum), - SPLIT_NS(p->se.sum_exec_runtime), - SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + if (schedstat_enabled()) { + SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", + SPLIT_NS(p->se.statistics.wait_sum), + SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(p->se.statistics.sum_sleep_runtime)); + } #else SEQ_printf(m, "%9Ld.%06ld %9Ld.%06ld %9Ld.%06ld", 0LL, 0L, @@ -258,8 +566,17 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq) { + struct dl_bw *dl_bw; + SEQ_printf(m, "\ndl_rq[%d]:\n", cpu); SEQ_printf(m, " .%-30s: %ld\n", "dl_nr_running", dl_rq->dl_nr_running); +#ifdef CONFIG_SMP + dl_bw = &cpu_rq(cpu)->rd->dl_bw; +#else + dl_bw = &dl_rq->dl_bw; +#endif + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->bw", dl_bw->bw); + SEQ_printf(m, " .%-30s: %lld\n", "dl_bw->total_bw", dl_bw->total_bw); } extern __read_mostly int sched_clock_running; @@ -313,17 +630,18 @@ do { \ #define P(n) SEQ_printf(m, " .%-30s: %d\n", #n, rq->n); #define P64(n) SEQ_printf(m, " .%-30s: %Ld\n", #n, rq->n); - P(yld_count); - - P(sched_count); - P(sched_goidle); #ifdef CONFIG_SMP P64(avg_idle); P64(max_idle_balance_cost); #endif - P(ttwu_count); - P(ttwu_local); + if (schedstat_enabled()) { + P(yld_count); + P(sched_count); + P(sched_goidle); + P(ttwu_count); + P(ttwu_local); + } #undef P #undef P64 @@ -569,38 +887,39 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m) nr_switches = p->nvcsw + p->nivcsw; #ifdef CONFIG_SCHEDSTATS - PN(se.statistics.sum_sleep_runtime); - PN(se.statistics.wait_start); - PN(se.statistics.sleep_start); - PN(se.statistics.block_start); - PN(se.statistics.sleep_max); - PN(se.statistics.block_max); - PN(se.statistics.exec_max); - PN(se.statistics.slice_max); - PN(se.statistics.wait_max); - PN(se.statistics.wait_sum); - P(se.statistics.wait_count); - PN(se.statistics.iowait_sum); - P(se.statistics.iowait_count); P(se.nr_migrations); - P(se.statistics.nr_migrations_cold); - P(se.statistics.nr_failed_migrations_affine); - P(se.statistics.nr_failed_migrations_running); - P(se.statistics.nr_failed_migrations_hot); - P(se.statistics.nr_forced_migrations); - P(se.statistics.nr_wakeups); - P(se.statistics.nr_wakeups_sync); - P(se.statistics.nr_wakeups_migrate); - P(se.statistics.nr_wakeups_local); - P(se.statistics.nr_wakeups_remote); - P(se.statistics.nr_wakeups_affine); - P(se.statistics.nr_wakeups_affine_attempts); - P(se.statistics.nr_wakeups_passive); - P(se.statistics.nr_wakeups_idle); - { + if (schedstat_enabled()) { u64 avg_atom, avg_per_cpu; + PN(se.statistics.sum_sleep_runtime); + PN(se.statistics.wait_start); + PN(se.statistics.sleep_start); + PN(se.statistics.block_start); + PN(se.statistics.sleep_max); + PN(se.statistics.block_max); + PN(se.statistics.exec_max); + PN(se.statistics.slice_max); + PN(se.statistics.wait_max); + PN(se.statistics.wait_sum); + P(se.statistics.wait_count); + PN(se.statistics.iowait_sum); + P(se.statistics.iowait_count); + P(se.statistics.nr_migrations_cold); + P(se.statistics.nr_failed_migrations_affine); + P(se.statistics.nr_failed_migrations_running); + P(se.statistics.nr_failed_migrations_hot); + P(se.statistics.nr_forced_migrations); + P(se.statistics.nr_wakeups); + P(se.statistics.nr_wakeups_sync); + P(se.statistics.nr_wakeups_migrate); + P(se.statistics.nr_wakeups_local); + P(se.statistics.nr_wakeups_remote); + P(se.statistics.nr_wakeups_affine); + P(se.statistics.nr_wakeups_affine_attempts); + P(se.statistics.nr_wakeups_passive); + P(se.statistics.nr_wakeups_idle); + avg_atom = p->se.sum_exec_runtime; if (nr_switches) avg_atom = div64_ul(avg_atom, nr_switches); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 824aa9f501a3..0fe30e66aff1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -17,11 +17,11 @@ * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de> * * Adaptive scheduling granularity, math enhancements by Peter Zijlstra - * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ -#include <linux/latencytop.h> #include <linux/sched.h> +#include <linux/latencytop.h> #include <linux/cpumask.h> #include <linux/cpuidle.h> #include <linux/slab.h> @@ -738,16 +738,52 @@ static void update_curr_fair(struct rq *rq) update_curr(cfs_rq_of(&rq->curr->se)); } +#ifdef CONFIG_SCHEDSTATS static inline void update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) { - schedstat_set(se->statistics.wait_start, rq_clock(rq_of(cfs_rq))); + u64 wait_start = rq_clock(rq_of(cfs_rq)); + + if (entity_is_task(se) && task_on_rq_migrating(task_of(se)) && + likely(wait_start > se->statistics.wait_start)) + wait_start -= se->statistics.wait_start; + + se->statistics.wait_start = wait_start; +} + +static void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + struct task_struct *p; + u64 delta; + + delta = rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start; + + if (entity_is_task(se)) { + p = task_of(se); + if (task_on_rq_migrating(p)) { + /* + * Preserve migrating task's wait time so wait_start + * time stamp can be adjusted to accumulate wait time + * prior to migration. + */ + se->statistics.wait_start = delta; + return; + } + trace_sched_stat_wait(p, delta); + } + + se->statistics.wait_max = max(se->statistics.wait_max, delta); + se->statistics.wait_count++; + se->statistics.wait_sum += delta; + se->statistics.wait_start = 0; } /* * Task is being enqueued - update stats: */ -static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { /* * Are we enqueueing a waiting task? (for current tasks @@ -757,25 +793,8 @@ static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) update_stats_wait_start(cfs_rq, se); } -static void -update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max, - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start)); - schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1); - schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum + - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - trace_sched_stat_wait(task_of(se), - rq_clock(rq_of(cfs_rq)) - se->statistics.wait_start); - } -#endif - schedstat_set(se->statistics.wait_start, 0); -} - static inline void -update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { /* * Mark the end of the wait period if dequeueing a @@ -783,8 +802,41 @@ update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) */ if (se != cfs_rq->curr) update_stats_wait_end(cfs_rq, se); + + if (flags & DEQUEUE_SLEEP) { + if (entity_is_task(se)) { + struct task_struct *tsk = task_of(se); + + if (tsk->state & TASK_INTERRUPTIBLE) + se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); + if (tsk->state & TASK_UNINTERRUPTIBLE) + se->statistics.block_start = rq_clock(rq_of(cfs_rq)); + } + } + +} +#else +static inline void +update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ } +static inline void +update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ +} + +static inline void +update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +{ +} +#endif + /* * We are picking a new current task - update its stats: */ @@ -880,10 +932,11 @@ struct numa_group { spinlock_t lock; /* nr_tasks, tasks */ int nr_tasks; pid_t gid; + int active_nodes; struct rcu_head rcu; - nodemask_t active_nodes; unsigned long total_faults; + unsigned long max_faults_cpu; /* * Faults_cpu is used to decide whether memory should move * towards the CPU. As a consequence, these stats are weighted @@ -942,6 +995,18 @@ static inline unsigned long group_faults_cpu(struct numa_group *group, int nid) group->faults_cpu[task_faults_idx(NUMA_MEM, nid, 1)]; } +/* + * A node triggering more than 1/3 as many NUMA faults as the maximum is + * considered part of a numa group's pseudo-interleaving set. Migrations + * between these nodes are slowed down, to allow things to settle down. + */ +#define ACTIVE_NODE_FRACTION 3 + +static bool numa_is_active_node(int nid, struct numa_group *ng) +{ + return group_faults_cpu(ng, nid) * ACTIVE_NODE_FRACTION > ng->max_faults_cpu; +} + /* Handle placement on systems where not all nodes are directly connected. */ static unsigned long score_nearby_nodes(struct task_struct *p, int nid, int maxdist, bool task) @@ -1091,27 +1156,23 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, return true; /* - * Do not migrate if the destination is not a node that - * is actively used by this numa group. - */ - if (!node_isset(dst_nid, ng->active_nodes)) - return false; - - /* - * Source is a node that is not actively used by this - * numa group, while the destination is. Migrate. + * Destination node is much more heavily used than the source + * node? Allow migration. */ - if (!node_isset(src_nid, ng->active_nodes)) + if (group_faults_cpu(ng, dst_nid) > group_faults_cpu(ng, src_nid) * + ACTIVE_NODE_FRACTION) return true; /* - * Both source and destination are nodes in active - * use by this numa group. Maximize memory bandwidth - * by migrating from more heavily used groups, to less - * heavily used ones, spreading the load around. - * Use a 1/4 hysteresis to avoid spurious page movement. + * Distribute memory according to CPU & memory use on each node, + * with 3/4 hysteresis to avoid unnecessary memory migrations: + * + * faults_cpu(dst) 3 faults_cpu(src) + * --------------- * - > --------------- + * faults_mem(dst) 4 faults_mem(src) */ - return group_faults(p, dst_nid) < (group_faults(p, src_nid) * 3 / 4); + return group_faults_cpu(ng, dst_nid) * group_faults(p, src_nid) * 3 > + group_faults_cpu(ng, src_nid) * group_faults(p, dst_nid) * 4; } static unsigned long weighted_cpuload(const int cpu); @@ -1193,8 +1254,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1262,20 +1321,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1359,6 +1428,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1384,9 +1454,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, @@ -1441,7 +1518,7 @@ static int task_numa_migrate(struct task_struct *p) .best_task = NULL, .best_imp = 0, - .best_cpu = -1 + .best_cpu = -1, }; struct sched_domain *sd; unsigned long taskweight, groupweight; @@ -1493,8 +1570,7 @@ static int task_numa_migrate(struct task_struct *p) * multiple NUMA nodes; in order to better consolidate the group, * we need to check other locations. */ - if (env.best_cpu == -1 || (p->numa_group && - nodes_weight(p->numa_group->active_nodes) > 1)) { + if (env.best_cpu == -1 || (p->numa_group && p->numa_group->active_nodes > 1)) { for_each_online_node(nid) { if (nid == env.src_nid || nid == p->numa_preferred_nid) continue; @@ -1529,12 +1605,14 @@ static int task_numa_migrate(struct task_struct *p) * trying for a better one later. Do not set the preferred node here. */ if (p->numa_group) { + struct numa_group *ng = p->numa_group; + if (env.best_cpu == -1) nid = env.src_nid; else nid = env.dst_nid; - if (node_isset(nid, p->numa_group->active_nodes)) + if (ng->active_nodes > 1 && numa_is_active_node(env.dst_nid, ng)) sched_setnuma(p, env.dst_nid); } @@ -1584,20 +1662,15 @@ static void numa_migrate_preferred(struct task_struct *p) } /* - * Find the nodes on which the workload is actively running. We do this by + * Find out how many nodes on the workload is actively running on. Do this by * tracking the nodes from which NUMA hinting faults are triggered. This can * be different from the set of nodes where the workload's memory is currently * located. - * - * The bitmask is used to make smarter decisions on when to do NUMA page - * migrations, To prevent flip-flopping, and excessive page migrations, nodes - * are added when they cause over 6/16 of the maximum number of faults, but - * only removed when they drop below 3/16. */ -static void update_numa_active_node_mask(struct numa_group *numa_group) +static void numa_group_count_active_nodes(struct numa_group *numa_group) { unsigned long faults, max_faults = 0; - int nid; + int nid, active_nodes = 0; for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); @@ -1607,12 +1680,12 @@ static void update_numa_active_node_mask(struct numa_group *numa_group) for_each_online_node(nid) { faults = group_faults_cpu(numa_group, nid); - if (!node_isset(nid, numa_group->active_nodes)) { - if (faults > max_faults * 6 / 16) - node_set(nid, numa_group->active_nodes); - } else if (faults < max_faults * 3 / 16) - node_clear(nid, numa_group->active_nodes); + if (faults * ACTIVE_NODE_FRACTION > max_faults) + active_nodes++; } + + numa_group->max_faults_cpu = max_faults; + numa_group->active_nodes = active_nodes; } /* @@ -1903,7 +1976,7 @@ static void task_numa_placement(struct task_struct *p) update_task_scan_period(p, fault_types[0], fault_types[1]); if (p->numa_group) { - update_numa_active_node_mask(p->numa_group); + numa_group_count_active_nodes(p->numa_group); spin_unlock_irq(group_lock); max_nid = preferred_group_nid(p, max_group_nid); } @@ -1947,14 +2020,14 @@ static void task_numa_group(struct task_struct *p, int cpupid, int flags, return; atomic_set(&grp->refcount, 1); + grp->active_nodes = 1; + grp->max_faults_cpu = 0; spin_lock_init(&grp->lock); grp->gid = p->pid; /* Second half of the array tracks nids where faults happen */ grp->faults_cpu = grp->faults + NR_NUMA_HINT_FAULT_TYPES * nr_node_ids; - node_set(task_node(current), grp->active_nodes); - for (i = 0; i < NR_NUMA_HINT_FAULT_STATS * nr_node_ids; i++) grp->faults[i] = p->numa_faults[i]; @@ -2068,6 +2141,7 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) bool migrated = flags & TNF_MIGRATED; int cpu_node = task_node(current); int local = !!(flags & TNF_FAULT_LOCAL); + struct numa_group *ng; int priv; if (!static_branch_likely(&sched_numa_balancing)) @@ -2108,9 +2182,10 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) * actively using should be counted as local. This allows the * scan rate to slow down when a workload has settled down. */ - if (!priv && !local && p->numa_group && - node_isset(cpu_node, p->numa_group->active_nodes) && - node_isset(mem_node, p->numa_group->active_nodes)) + ng = p->numa_group; + if (!priv && !local && ng && ng->active_nodes > 1 && + numa_is_active_node(cpu_node, ng) && + numa_is_active_node(mem_node, ng)) local = 1; task_numa_placement(p); @@ -2155,6 +2230,7 @@ void task_numa_work(struct callback_head *work) unsigned long migrate, next_scan, now = jiffies; struct task_struct *p = current; struct mm_struct *mm = p->mm; + u64 runtime = p->se.sum_exec_runtime; struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2277,6 +2353,17 @@ out: else reset_ptenuma_scan(p); up_read(&mm->mmap_sem); + + /* + * Make sure tasks use at least 32x as much time to run other code + * than they used here, to limit NUMA PTE scanning overhead to 3% max. + * Usually update_task_scan_period slows down scanning enough; on an + * overloaded system we need to limit overhead on a per task basis. + */ + if (unlikely(p->se.sum_exec_runtime != runtime)) { + u64 diff = p->se.sum_exec_runtime - runtime; + p->node_stamp += 32 * diff; + } } /* @@ -2302,7 +2389,7 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr) now = curr->se.sum_exec_runtime; period = (u64)curr->numa_scan_period * NSEC_PER_MSEC; - if (now - curr->node_stamp > period) { + if (now > curr->node_stamp + period) { if (!curr->node_stamp) curr->numa_scan_period = task_scan_min(curr); curr->node_stamp += period; @@ -2670,12 +2757,64 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) { long delta = cfs_rq->avg.load_avg - cfs_rq->tg_load_avg_contrib; + /* + * No need to update load_avg for root_task_group as it is not used. + */ + if (cfs_rq->tg == &root_task_group) + return; + if (force || abs(delta) > cfs_rq->tg_load_avg_contrib / 64) { atomic_long_add(delta, &cfs_rq->tg->load_avg); cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg; } } +/* + * Called within set_task_rq() right before setting a task's cpu. The + * caller only guarantees p->pi_lock is held; no other assumptions, + * including the state of rq->lock, should be made. + */ +void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) +{ + if (!sched_feat(ATTACH_AGE_LOAD)) + return; + + /* + * We are supposed to update the task to "current" time, then its up to + * date and ready to go to new CPU/cfs_rq. But we have difficulty in + * getting what current time is, so simply throw away the out-of-date + * time. This will result in the wakee task is less decayed, but giving + * the wakee more load sounds not bad. + */ + if (se->avg.last_update_time && prev) { + u64 p_last_update_time; + u64 n_last_update_time; + +#ifndef CONFIG_64BIT + u64 p_last_update_time_copy; + u64 n_last_update_time_copy; + + do { + p_last_update_time_copy = prev->load_last_update_time_copy; + n_last_update_time_copy = next->load_last_update_time_copy; + + smp_rmb(); + + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; + + } while (p_last_update_time != p_last_update_time_copy || + n_last_update_time != n_last_update_time_copy); +#else + p_last_update_time = prev->avg.last_update_time; + n_last_update_time = next->avg.last_update_time; +#endif + __update_load_avg(p_last_update_time, cpu_of(rq_of(prev)), + &se->avg, 0, 0, NULL); + se->avg.last_update_time = n_last_update_time; + } +} #else /* CONFIG_FAIR_GROUP_SCHED */ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force) {} #endif /* CONFIG_FAIR_GROUP_SCHED */ @@ -2689,7 +2828,7 @@ static inline int update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq) int decayed, removed = 0; if (atomic_long_read(&cfs_rq->removed_load_avg)) { - long r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); + s64 r = atomic_long_xchg(&cfs_rq->removed_load_avg, 0); sa->load_avg = max_t(long, sa->load_avg - r, 0); sa->load_sum = max_t(s64, sa->load_sum - r * LOAD_AVG_MAX, 0); removed = 1; @@ -2717,7 +2856,8 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) { struct cfs_rq *cfs_rq = cfs_rq_of(se); u64 now = cfs_rq_clock_task(cfs_rq); - int cpu = cpu_of(rq_of(cfs_rq)); + struct rq *rq = rq_of(cfs_rq); + int cpu = cpu_of(rq); /* * Track task load average for carrying it to new CPU after migrated, and @@ -2729,6 +2869,29 @@ static inline void update_load_avg(struct sched_entity *se, int update_tg) if (update_cfs_rq_load_avg(now, cfs_rq) && update_tg) update_tg_load_avg(cfs_rq, 0); + + if (cpu == smp_processor_id() && &rq->cfs == cfs_rq) { + unsigned long max = rq->cpu_capacity_orig; + + /* + * There are a few boundary cases this might miss but it should + * get called often enough that that should (hopefully) not be + * a real problem -- added to that it only calls on the local + * CPU, so if we enqueue remotely we'll miss an update, but + * the next tick/schedule should update. + * + * It will not get called when we go idle, because the idle + * thread is a different class (!fair), nor will the utilization + * number include things like RT tasks. + * + * As is, the util number is not freq-invariant (we'd have to + * implement arch_scale_freq_capacity() for that). + * + * See cpu_util(). + */ + cpufreq_update_util(rq_clock(rq), + min(cfs_rq->avg.util_avg, max), max); + } } static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -2809,48 +2972,48 @@ dequeue_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) max_t(s64, cfs_rq->runnable_load_sum - se->avg.load_sum, 0); } -/* - * Task first catches up with cfs_rq, and then subtract - * itself from the cfs_rq (task must be off the queue now). - */ -void remove_entity_load_avg(struct sched_entity *se) -{ - struct cfs_rq *cfs_rq = cfs_rq_of(se); - u64 last_update_time; - #ifndef CONFIG_64BIT +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) +{ u64 last_update_time_copy; + u64 last_update_time; do { last_update_time_copy = cfs_rq->load_last_update_time_copy; smp_rmb(); last_update_time = cfs_rq->avg.last_update_time; } while (last_update_time != last_update_time_copy); -#else - last_update_time = cfs_rq->avg.last_update_time; -#endif - __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); - atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); - atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); + return last_update_time; } - -/* - * Update the rq's load with the elapsed running time before entering - * idle. if the last scheduled task is not a CFS task, idle_enter will - * be the only way to update the runnable statistic. - */ -void idle_enter_fair(struct rq *this_rq) +#else +static inline u64 cfs_rq_last_update_time(struct cfs_rq *cfs_rq) { + return cfs_rq->avg.last_update_time; } +#endif /* - * Update the rq's load with the elapsed idle time before a task is - * scheduled. if the newly scheduled task is not a CFS task, idle_exit will - * be the only way to update the runnable statistic. + * Task first catches up with cfs_rq, and then subtract + * itself from the cfs_rq (task must be off the queue now). */ -void idle_exit_fair(struct rq *this_rq) +void remove_entity_load_avg(struct sched_entity *se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + u64 last_update_time; + + /* + * Newly created task or never used group entity should not be removed + * from its (source) cfs_rq + */ + if (se->avg.last_update_time == 0) + return; + + last_update_time = cfs_rq_last_update_time(cfs_rq); + + __update_load_avg(last_update_time, cpu_of(rq_of(cfs_rq)), &se->avg, 0, 0, NULL); + atomic_long_add(se->avg.load_avg, &cfs_rq->removed_load_avg); + atomic_long_add(se->avg.util_avg, &cfs_rq->removed_util_avg); } static inline unsigned long cfs_rq_runnable_load_avg(struct cfs_rq *cfs_rq) @@ -2995,32 +3158,64 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) static void check_enqueue_throttle(struct cfs_rq *cfs_rq); +static inline void check_schedstat_required(void) +{ +#ifdef CONFIG_SCHEDSTATS + if (schedstat_enabled()) + return; + + /* Force schedstat enabled if a dependent tracepoint is active */ + if (trace_sched_stat_wait_enabled() || + trace_sched_stat_sleep_enabled() || + trace_sched_stat_iowait_enabled() || + trace_sched_stat_blocked_enabled() || + trace_sched_stat_runtime_enabled()) { + pr_warn_once("Scheduler tracepoints stat_sleep, stat_iowait, " + "stat_blocked and stat_runtime require the " + "kernel parameter schedstats=enabled or " + "kernel.sched_schedstats=1\n"); + } +#endif +} + static void enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) { + bool renorm = !(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING); + bool curr = cfs_rq->curr == se; + /* - * Update the normalized vruntime before updating min_vruntime - * through calling update_curr(). + * If we're the current task, we must renormalise before calling + * update_curr(). */ - if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING)) + if (renorm && curr) se->vruntime += cfs_rq->min_vruntime; + update_curr(cfs_rq); + /* - * Update run-time statistics of the 'current'. + * Otherwise, renormalise after, such that we're placed at the current + * moment in time, instead of some random moment in the past. */ - update_curr(cfs_rq); + if (renorm && !curr) + se->vruntime += cfs_rq->min_vruntime; + enqueue_entity_load_avg(cfs_rq, se); account_entity_enqueue(cfs_rq, se); update_cfs_shares(cfs_rq); if (flags & ENQUEUE_WAKEUP) { place_entity(cfs_rq, se, 0); - enqueue_sleeper(cfs_rq, se); + if (schedstat_enabled()) + enqueue_sleeper(cfs_rq, se); } - update_stats_enqueue(cfs_rq, se); - check_spread(cfs_rq, se); - if (se != cfs_rq->curr) + check_schedstat_required(); + if (schedstat_enabled()) { + update_stats_enqueue(cfs_rq, se); + check_spread(cfs_rq, se); + } + if (!curr) __enqueue_entity(cfs_rq, se); se->on_rq = 1; @@ -3086,19 +3281,8 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) update_curr(cfs_rq); dequeue_entity_load_avg(cfs_rq, se); - update_stats_dequeue(cfs_rq, se); - if (flags & DEQUEUE_SLEEP) { -#ifdef CONFIG_SCHEDSTATS - if (entity_is_task(se)) { - struct task_struct *tsk = task_of(se); - - if (tsk->state & TASK_INTERRUPTIBLE) - se->statistics.sleep_start = rq_clock(rq_of(cfs_rq)); - if (tsk->state & TASK_UNINTERRUPTIBLE) - se->statistics.block_start = rq_clock(rq_of(cfs_rq)); - } -#endif - } + if (schedstat_enabled()) + update_stats_dequeue(cfs_rq, se, flags); clear_buddies(cfs_rq, se); @@ -3172,7 +3356,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * a CPU. So account for the time it spent waiting on the * runqueue. */ - update_stats_wait_end(cfs_rq, se); + if (schedstat_enabled()) + update_stats_wait_end(cfs_rq, se); __dequeue_entity(cfs_rq, se); update_load_avg(se, 1); } @@ -3185,7 +3370,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * least twice that of our own weight (i.e. dont track it * when there are only lesser-weight tasks around): */ - if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { + if (schedstat_enabled() && rq_of(cfs_rq)->load.weight >= 2*se->load.weight) { se->statistics.slice_max = max(se->statistics.slice_max, se->sum_exec_runtime - se->prev_sum_exec_runtime); } @@ -3268,9 +3453,13 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) /* throttle cfs_rqs exceeding runtime */ check_cfs_rq_runtime(cfs_rq); - check_spread(cfs_rq, prev); + if (schedstat_enabled()) { + check_spread(cfs_rq, prev); + if (prev->on_rq) + update_stats_wait_start(cfs_rq, prev); + } + if (prev->on_rq) { - update_stats_wait_start(cfs_rq, prev); /* Put 'current' back into the tree. */ __enqueue_entity(cfs_rq, prev); /* in !on_rq case, update occurred at dequeue */ @@ -4240,42 +4429,37 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) */ /* - * The exact cpuload at various idx values, calculated at every tick would be - * load = (2^idx - 1) / 2^idx * load + 1 / 2^idx * cur_load + * The exact cpuload calculated at every tick would be: + * + * load' = (1 - 1/2^i) * load + (1/2^i) * cur_load * - * If a cpu misses updates for n-1 ticks (as it was idle) and update gets called - * on nth tick when cpu may be busy, then we have: - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * load = (2^idx - 1) / 2^idx) * load + 1 / 2^idx * cur_load + * If a cpu misses updates for n ticks (as it was idle) and update gets + * called on the n+1-th tick when cpu may be busy, then we have: + * + * load_n = (1 - 1/2^i)^n * load_0 + * load_n+1 = (1 - 1/2^i) * load_n + (1/2^i) * cur_load * * decay_load_missed() below does efficient calculation of - * load = ((2^idx - 1) / 2^idx)^(n-1) * load - * avoiding 0..n-1 loop doing load = ((2^idx - 1) / 2^idx) * load + * + * load' = (1 - 1/2^i)^n * load + * + * Because x^(n+m) := x^n * x^m we can decompose any x^n in power-of-2 factors. + * This allows us to precompute the above in said factors, thereby allowing the + * reduction of an arbitrary n in O(log_2 n) steps. (See also + * fixed_power_int()) * * The calculation is approximated on a 128 point scale. - * degrade_zero_ticks is the number of ticks after which load at any - * particular idx is approximated to be zero. - * degrade_factor is a precomputed table, a row for each load idx. - * Each column corresponds to degradation factor for a power of two ticks, - * based on 128 point scale. - * Example: - * row 2, col 3 (=12) says that the degradation at load idx 2 after - * 8 ticks is 12/128 (which is an approximation of exact factor 3^8/4^8). - * - * With this power of 2 load factors, we can degrade the load n times - * by looking at 1 bits in n and doing as many mult/shift instead of - * n mult/shifts needed by the exact degradation. */ #define DEGRADE_SHIFT 7 -static const unsigned char - degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; -static const unsigned char - degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { - {0, 0, 0, 0, 0, 0, 0, 0}, - {64, 32, 8, 0, 0, 0, 0, 0}, - {96, 72, 40, 12, 1, 0, 0}, - {112, 98, 75, 43, 15, 1, 0}, - {120, 112, 98, 76, 45, 16, 2} }; + +static const u8 degrade_zero_ticks[CPU_LOAD_IDX_MAX] = {0, 8, 32, 64, 128}; +static const u8 degrade_factor[CPU_LOAD_IDX_MAX][DEGRADE_SHIFT + 1] = { + { 0, 0, 0, 0, 0, 0, 0, 0 }, + { 64, 32, 8, 0, 0, 0, 0, 0 }, + { 96, 72, 40, 12, 1, 0, 0, 0 }, + { 112, 98, 75, 43, 15, 1, 0, 0 }, + { 120, 112, 98, 76, 45, 16, 2, 0 } +}; /* * Update cpu_load for any missed ticks, due to tickless idle. The backlog @@ -4306,14 +4490,46 @@ decay_load_missed(unsigned long load, unsigned long missed_updates, int idx) return load; } -/* +/** + * __update_cpu_load - update the rq->cpu_load[] statistics + * @this_rq: The rq to update statistics for + * @this_load: The current load + * @pending_updates: The number of missed updates + * @active: !0 for NOHZ_FULL + * * Update rq->cpu_load[] statistics. This function is usually called every - * scheduler tick (TICK_NSEC). With tickless idle this will not be called - * every tick. We fix it up based on jiffies. + * scheduler tick (TICK_NSEC). + * + * This function computes a decaying average: + * + * load[i]' = (1 - 1/2^i) * load[i] + (1/2^i) * load + * + * Because of NOHZ it might not get called on every tick which gives need for + * the @pending_updates argument. + * + * load[i]_n = (1 - 1/2^i) * load[i]_n-1 + (1/2^i) * load_n-1 + * = A * load[i]_n-1 + B ; A := (1 - 1/2^i), B := (1/2^i) * load + * = A * (A * load[i]_n-2 + B) + B + * = A * (A * (A * load[i]_n-3 + B) + B) + B + * = A^3 * load[i]_n-3 + (A^2 + A + 1) * B + * = A^n * load[i]_0 + (A^(n-1) + A^(n-2) + ... + 1) * B + * = A^n * load[i]_0 + ((1 - A^n) / (1 - A)) * B + * = (1 - 1/2^i)^n * (load[i]_0 - load) + load + * + * In the above we've assumed load_n := load, which is true for NOHZ_FULL as + * any change in load would have resulted in the tick being turned back on. + * + * For regular NOHZ, this reduces to: + * + * load[i]_n = (1 - 1/2^i)^n * load[i]_0 + * + * see decay_load_misses(). For NOHZ_FULL we get to subtract and add the extra + * term. See the @active paramter. */ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, - unsigned long pending_updates) + unsigned long pending_updates, int active) { + unsigned long tickless_load = active ? this_rq->cpu_load[0] : 0; int i, scale; this_rq->nr_load_updates++; @@ -4327,6 +4543,15 @@ static void __update_cpu_load(struct rq *this_rq, unsigned long this_load, old_load = this_rq->cpu_load[i]; old_load = decay_load_missed(old_load, pending_updates - 1, i); + if (tickless_load) { + old_load -= decay_load_missed(tickless_load, pending_updates - 1, i); + /* + * old_load can never be a negative value because a + * decayed tickless_load cannot be greater than the + * original tickless_load. + */ + old_load += tickless_load; + } new_load = this_load; /* * Round up the averaging division if load is increasing. This @@ -4349,6 +4574,25 @@ static unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_NO_HZ_COMMON +static void __update_cpu_load_nohz(struct rq *this_rq, + unsigned long curr_jiffies, + unsigned long load, + int active) +{ + unsigned long pending_updates; + + pending_updates = curr_jiffies - this_rq->last_load_update_tick; + if (pending_updates) { + this_rq->last_load_update_tick = curr_jiffies; + /* + * In the regular NOHZ case, we were idle, this means load 0. + * In the NOHZ_FULL case, we were non-idle, we should consider + * its weighted load. + */ + __update_cpu_load(this_rq, load, pending_updates, active); + } +} + /* * There is no sane way to deal with nohz on smp when using jiffies because the * cpu doing the jiffies update might drift wrt the cpu doing the jiffy reading @@ -4366,46 +4610,31 @@ static unsigned long weighted_cpuload(const int cpu) * Called from nohz_idle_balance() to update the load ratings before doing the * idle balance. */ -static void update_idle_cpu_load(struct rq *this_rq) +static void update_cpu_load_idle(struct rq *this_rq) { - unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long load = weighted_cpuload(cpu_of(this_rq)); - unsigned long pending_updates; - /* * bail if there's load or we're actually up-to-date. */ - if (load || curr_jiffies == this_rq->last_load_update_tick) + if (weighted_cpuload(cpu_of(this_rq))) return; - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - this_rq->last_load_update_tick = curr_jiffies; - - __update_cpu_load(this_rq, load, pending_updates); + __update_cpu_load_nohz(this_rq, READ_ONCE(jiffies), 0, 0); } /* * Called from tick_nohz_idle_exit() -- try and fix up the ticks we missed. */ -void update_cpu_load_nohz(void) +void update_cpu_load_nohz(int active) { struct rq *this_rq = this_rq(); unsigned long curr_jiffies = READ_ONCE(jiffies); - unsigned long pending_updates; + unsigned long load = active ? weighted_cpuload(cpu_of(this_rq)) : 0; if (curr_jiffies == this_rq->last_load_update_tick) return; raw_spin_lock(&this_rq->lock); - pending_updates = curr_jiffies - this_rq->last_load_update_tick; - if (pending_updates) { - this_rq->last_load_update_tick = curr_jiffies; - /* - * We were idle, this means load 0, the current load might be - * !0 due to remote wakeups and the sort. - */ - __update_cpu_load(this_rq, 0, pending_updates); - } + __update_cpu_load_nohz(this_rq, curr_jiffies, load, active); raw_spin_unlock(&this_rq->lock); } #endif /* CONFIG_NO_HZ */ @@ -4417,10 +4646,10 @@ void update_cpu_load_active(struct rq *this_rq) { unsigned long load = weighted_cpuload(cpu_of(this_rq)); /* - * See the mess around update_idle_cpu_load() / update_cpu_load_nohz(). + * See the mess around update_cpu_load_idle() / update_cpu_load_nohz(). */ this_rq->last_load_update_tick = jiffies; - __update_cpu_load(this_rq, load, 1); + __update_cpu_load(this_rq, load, 1, 1); } /* @@ -4850,7 +5079,19 @@ static int select_idle_sibling(struct task_struct *p, int target) return i; /* - * Otherwise, iterate the domains and find an elegible idle cpu. + * Otherwise, iterate the domains and find an eligible idle cpu. + * + * A completely idle sched group at higher domains is more + * desirable than an idle group at a lower level, because lower + * domains have smaller groups and usually share hardware + * resources which causes tasks to contend on them, e.g. x86 + * hyperthread siblings in the lowest domain (SMT) can contend + * on the shared cpu pipeline. + * + * However, while we prefer idle groups at higher domains + * finding an idle cpu at the lowest domain is still better than + * returning 'target', which we've already established, isn't + * idle. */ sd = rcu_dereference(per_cpu(sd_llc, target)); for_each_lower_domain(sd) { @@ -4860,11 +5101,16 @@ static int select_idle_sibling(struct task_struct *p, int target) tsk_cpus_allowed(p))) goto next; + /* Ensure the entire group is idle */ for_each_cpu(i, sched_group_cpus(sg)) { if (i == target || !idle_cpu(i)) goto next; } + /* + * It doesn't matter which cpu we pick, the + * whole group is idle. + */ target = cpumask_first_and(sched_group_cpus(sg), tsk_cpus_allowed(p)); goto done; @@ -5007,8 +5253,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f /* * Called immediately before a task is migrated to a new cpu; task_cpu(p) and * cfs_rq_of(p) references at time of call are still valid and identify the - * previous cpu. However, the caller only guarantees p->pi_lock is held; no - * other assumptions, including the state of rq->lock, should be made. + * previous cpu. The caller guarantees p->pi_lock or task_rq(p)->lock is held. */ static void migrate_task_rq_fair(struct task_struct *p) { @@ -5721,8 +5966,8 @@ static void detach_task(struct task_struct *p, struct lb_env *env) { lockdep_assert_held(&env->src_rq->lock); - deactivate_task(env->src_rq, p, 0); p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); set_task_cpu(p, env->dst_cpu); } @@ -5855,8 +6100,8 @@ static void attach_task(struct rq *rq, struct task_struct *p) lockdep_assert_held(&rq->lock); BUG_ON(task_rq(p) != rq); - p->on_rq = TASK_ON_RQ_QUEUED; activate_task(rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; check_preempt_curr(rq, p, 0); } @@ -6302,7 +6547,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, bool *overload) { unsigned long load; - int i; + int i, nr_running; memset(sgs, 0, sizeof(*sgs)); @@ -6319,7 +6564,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->group_util += cpu_util(i); sgs->sum_nr_running += rq->cfs.h_nr_running; - if (rq->nr_running > 1) + nr_running = rq->nr_running; + if (nr_running > 1) *overload = true; #ifdef CONFIG_NUMA_BALANCING @@ -6327,7 +6573,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, sgs->nr_preferred_running += rq->nr_preferred_running; #endif sgs->sum_weighted_load += weighted_cpuload(i); - if (idle_cpu(i)) + /* + * No need to call idle_cpu() if nr_running is not 0 + */ + if (!nr_running && idle_cpu(i)) sgs->idle_cpus++; } @@ -7248,8 +7497,6 @@ static int idle_balance(struct rq *this_rq) int pulled_task = 0; u64 curr_cost = 0; - idle_enter_fair(this_rq); - /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. @@ -7330,10 +7577,8 @@ out: if (this_rq->nr_running != this_rq->cfs.h_nr_running) pulled_task = -1; - if (pulled_task) { - idle_exit_fair(this_rq); + if (pulled_task) this_rq->idle_stamp = 0; - } return pulled_task; } @@ -7712,7 +7957,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) if (time_after_eq(jiffies, rq->next_balance)) { raw_spin_lock_irq(&rq->lock); update_rq_clock(rq); - update_idle_cpu_load(rq); + update_cpu_load_idle(rq); raw_spin_unlock_irq(&rq->lock); rebalance_domains(rq, CPU_IDLE); } @@ -8098,11 +8343,8 @@ void free_fair_sched_group(struct task_group *tg) for_each_possible_cpu(i) { if (tg->cfs_rq) kfree(tg->cfs_rq[i]); - if (tg->se) { - if (tg->se[i]) - remove_entity_load_avg(tg->se[i]); + if (tg->se) kfree(tg->se[i]); - } } kfree(tg->cfs_rq); @@ -8150,21 +8392,29 @@ err: return 0; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) +void unregister_fair_sched_group(struct task_group *tg) { - struct rq *rq = cpu_rq(cpu); unsigned long flags; + struct rq *rq; + int cpu; - /* - * Only empty task groups can be destroyed; so we can speculatively - * check on_list without danger of it being re-added. - */ - if (!tg->cfs_rq[cpu]->on_list) - return; + for_each_possible_cpu(cpu) { + if (tg->se[cpu]) + remove_entity_load_avg(tg->se[cpu]); - raw_spin_lock_irqsave(&rq->lock, flags); - list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); - raw_spin_unlock_irqrestore(&rq->lock, flags); + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. + */ + if (!tg->cfs_rq[cpu]->on_list) + continue; + + rq = cpu_rq(cpu); + + raw_spin_lock_irqsave(&rq->lock, flags); + list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } } void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, @@ -8246,7 +8496,7 @@ int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent) return 1; } -void unregister_fair_sched_group(struct task_group *tg, int cpu) { } +void unregister_fair_sched_group(struct task_group *tg) { } #endif /* CONFIG_FAIR_GROUP_SCHED */ diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 4a2ef5a02fd3..bd12c6c714ec 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -4,6 +4,7 @@ #include <linux/sched.h> #include <linux/cpu.h> #include <linux/cpuidle.h> +#include <linux/cpuhotplug.h> #include <linux/tick.h> #include <linux/mm.h> #include <linux/stackprotector.h> @@ -97,12 +98,6 @@ void default_idle_call(void) static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev, int next_state) { - /* Fall back to the default arch idle method on errors. */ - if (next_state < 0) { - default_idle_call(); - return next_state; - } - /* * The idle task must be scheduled, it is pointless to go to idle, just * update no idle residency and return. @@ -168,7 +163,7 @@ static void cpuidle_idle_call(void) */ if (idle_should_freeze()) { entered_state = cpuidle_enter_freeze(drv, dev); - if (entered_state >= 0) { + if (entered_state > 0) { local_irq_enable(); goto exit_idle; } @@ -199,8 +194,6 @@ exit_idle: rcu_idle_exit(); } -DEFINE_PER_CPU(bool, cpu_dead_idle); - /* * Generic idle loop implementation * @@ -219,6 +212,7 @@ static void cpu_idle_loop(void) */ __current_set_polling(); + quiet_vmstat(); tick_nohz_idle_enter(); while (!need_resched()) { @@ -226,10 +220,7 @@ static void cpu_idle_loop(void) rmb(); if (cpu_is_offline(smp_processor_id())) { - rcu_cpu_notify(NULL, CPU_DYING_IDLE, - (void *)(long)smp_processor_id()); - smp_mb(); /* all activity before dead. */ - this_cpu_write(cpu_dead_idle, true); + cpuhp_report_idle_dead(); arch_cpu_idle_dead(); } @@ -296,5 +287,6 @@ void cpu_startup_entry(enum cpuhp_state state) boot_init_stack_canary(); #endif arch_cpu_idle_prepare(); + cpuhp_online_idle(state); cpu_idle_loop(); } diff --git a/kernel/sched/idle_task.c b/kernel/sched/idle_task.c index c4ae0f1fdf9b..47ce94931f1b 100644 --- a/kernel/sched/idle_task.c +++ b/kernel/sched/idle_task.c @@ -47,7 +47,6 @@ dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) { - idle_exit_fair(rq); rq_last_tick_reset(rq); } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index e3cc16312046..c41ea7ac1764 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -58,13 +58,21 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) raw_spin_lock(&rt_b->rt_runtime_lock); if (!rt_b->rt_period_active) { rt_b->rt_period_active = 1; - hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period); + /* + * SCHED_DEADLINE updates the bandwidth, as a run away + * RT task with a DL task could hog a CPU. But DL does + * not reset the period. If a deadline task was running + * without an RT task running, it can cause RT tasks to + * throttle when they start up. Kick the timer right away + * to update the period. + */ + hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0)); hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED); } raw_spin_unlock(&rt_b->rt_runtime_lock); } -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) && defined(HAVE_RT_PUSH_IPI) static void push_irq_work_func(struct irq_work *work); #endif @@ -436,7 +444,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq); static inline int on_rt_rq(struct sched_rt_entity *rt_se) { - return !list_empty(&rt_se->run_list); + return rt_se->on_rq; } #ifdef CONFIG_RT_GROUP_SCHED @@ -482,8 +490,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) return rt_se->my_q; } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head); -static void dequeue_rt_entity(struct sched_rt_entity *rt_se); +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags); static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) { @@ -499,7 +507,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq) if (!rt_se) enqueue_top_rt_rq(rt_rq); else if (!on_rt_rq(rt_se)) - enqueue_rt_entity(rt_se, false); + enqueue_rt_entity(rt_se, 0); if (rt_rq->highest_prio.curr < curr->prio) resched_curr(rq); @@ -516,7 +524,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq) if (!rt_se) dequeue_top_rt_rq(rt_rq); else if (on_rt_rq(rt_se)) - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, 0); } static inline int rt_rq_throttled(struct rt_rq *rt_rq) @@ -945,6 +953,10 @@ static void update_curr_rt(struct rq *rq) if (curr->sched_class != &rt_sched_class) return; + /* Kick cpufreq (see the comment in linux/cpufreq.h). */ + if (cpu_of(rq) == smp_processor_id()) + cpufreq_trigger_update(rq_clock(rq)); + delta_exec = rq_clock_task(rq) - curr->se.exec_start; if (unlikely((s64)delta_exec <= 0)) return; @@ -1142,12 +1154,27 @@ unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se) } static inline +unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se) +{ + struct rt_rq *group_rq = group_rt_rq(rt_se); + struct task_struct *tsk; + + if (group_rq) + return group_rq->rr_nr_running; + + tsk = rt_task_of(rt_se); + + return (tsk->policy == SCHED_RR) ? 1 : 0; +} + +static inline void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) { int prio = rt_se_prio(rt_se); WARN_ON(!rt_prio(prio)); rt_rq->rt_nr_running += rt_se_nr_running(rt_se); + rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se); inc_rt_prio(rt_rq, prio); inc_rt_migration(rt_se, rt_rq); @@ -1160,13 +1187,37 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) WARN_ON(!rt_prio(rt_se_prio(rt_se))); WARN_ON(!rt_rq->rt_nr_running); rt_rq->rt_nr_running -= rt_se_nr_running(rt_se); + rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se); dec_rt_prio(rt_rq, rt_se_prio(rt_se)); dec_rt_migration(rt_se, rt_rq); dec_rt_group(rt_se, rt_rq); } -static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +/* + * Change rt_se->run_list location unless SAVE && !MOVE + * + * assumes ENQUEUE/DEQUEUE flags match + */ +static inline bool move_entity(unsigned int flags) +{ + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE) + return false; + + return true; +} + +static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array) +{ + list_del_init(&rt_se->run_list); + + if (list_empty(array->queue + rt_se_prio(rt_se))) + __clear_bit(rt_se_prio(rt_se), array->bitmap); + + rt_se->on_list = 0; +} + +static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; @@ -1179,26 +1230,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) * get throttled and the current group doesn't have any other * active members. */ - if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) + if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) { + if (rt_se->on_list) + __delist_rt_entity(rt_se, array); return; + } - if (head) - list_add(&rt_se->run_list, queue); - else - list_add_tail(&rt_se->run_list, queue); - __set_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(rt_se->on_list); + if (flags & ENQUEUE_HEAD) + list_add(&rt_se->run_list, queue); + else + list_add_tail(&rt_se->run_list, queue); + + __set_bit(rt_se_prio(rt_se), array->bitmap); + rt_se->on_list = 1; + } + rt_se->on_rq = 1; inc_rt_tasks(rt_se, rt_rq); } -static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rt_rq *rt_rq = rt_rq_of_se(rt_se); struct rt_prio_array *array = &rt_rq->active; - list_del_init(&rt_se->run_list); - if (list_empty(array->queue + rt_se_prio(rt_se))) - __clear_bit(rt_se_prio(rt_se), array->bitmap); + if (move_entity(flags)) { + WARN_ON_ONCE(!rt_se->on_list); + __delist_rt_entity(rt_se, array); + } + rt_se->on_rq = 0; dec_rt_tasks(rt_se, rt_rq); } @@ -1207,7 +1269,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se) * Because the prio of an upper entry depends on the lower * entries, we must remove entries top - down. */ -static void dequeue_rt_stack(struct sched_rt_entity *rt_se) +static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags) { struct sched_rt_entity *back = NULL; @@ -1220,31 +1282,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se) for (rt_se = back; rt_se; rt_se = rt_se->back) { if (on_rt_rq(rt_se)) - __dequeue_rt_entity(rt_se); + __dequeue_rt_entity(rt_se, flags); } } -static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head) +static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) - __enqueue_rt_entity(rt_se, head); + __enqueue_rt_entity(rt_se, flags); enqueue_top_rt_rq(&rq->rt); } -static void dequeue_rt_entity(struct sched_rt_entity *rt_se) +static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags) { struct rq *rq = rq_of_rt_se(rt_se); - dequeue_rt_stack(rt_se); + dequeue_rt_stack(rt_se, flags); for_each_sched_rt_entity(rt_se) { struct rt_rq *rt_rq = group_rt_rq(rt_se); if (rt_rq && rt_rq->rt_nr_running) - __enqueue_rt_entity(rt_se, false); + __enqueue_rt_entity(rt_se, flags); } enqueue_top_rt_rq(&rq->rt); } @@ -1260,7 +1322,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) if (flags & ENQUEUE_WAKEUP) rt_se->timeout = 0; - enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD); + enqueue_rt_entity(rt_se, flags); if (!task_current(rq, p) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); @@ -1271,7 +1333,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) struct sched_rt_entity *rt_se = &p->rt; update_curr_rt(rq); - dequeue_rt_entity(rt_se); + dequeue_rt_entity(rt_se, flags); dequeue_pushable_task(rq, p); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index efd3bfc7e347..ec2e8d23527e 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -3,6 +3,7 @@ #include <linux/sched/sysctl.h> #include <linux/sched/rt.h> #include <linux/sched/deadline.h> +#include <linux/binfmts.h> #include <linux/mutex.h> #include <linux/spinlock.h> #include <linux/stop_machine.h> @@ -248,7 +249,12 @@ struct task_group { unsigned long shares; #ifdef CONFIG_SMP - atomic_long_t load_avg; + /* + * load_avg can be heavily contended at clock tick time, so put + * it in its own cacheline separated from the fields above which + * will also be accessed at each tick. + */ + atomic_long_t load_avg ____cacheline_aligned; #endif #endif @@ -308,12 +314,11 @@ extern int tg_nop(struct task_group *tg, void *data); extern void free_fair_sched_group(struct task_group *tg); extern int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent); -extern void unregister_fair_sched_group(struct task_group *tg, int cpu); +extern void unregister_fair_sched_group(struct task_group *tg); extern void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, struct sched_entity *se, int cpu, struct sched_entity *parent); extern void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b); -extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); extern void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b); extern void start_cfs_bandwidth(struct cfs_bandwidth *cfs_b); @@ -335,7 +340,15 @@ extern void sched_move_task(struct task_struct *tsk); #ifdef CONFIG_FAIR_GROUP_SCHED extern int sched_group_set_shares(struct task_group *tg, unsigned long shares); -#endif + +#ifdef CONFIG_SMP +extern void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next); +#else /* !CONFIG_SMP */ +static inline void set_task_rq_fair(struct sched_entity *se, + struct cfs_rq *prev, struct cfs_rq *next) { } +#endif /* CONFIG_SMP */ +#endif /* CONFIG_FAIR_GROUP_SCHED */ #else /* CONFIG_CGROUP_SCHED */ @@ -437,6 +450,7 @@ static inline int rt_bandwidth_enabled(void) struct rt_rq { struct rt_prio_array active; unsigned int rt_nr_running; + unsigned int rr_nr_running; #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED struct { int curr; /* highest queued rt task prio */ @@ -896,6 +910,18 @@ static inline unsigned int group_first_cpu(struct sched_group *group) extern int group_balance_cpu(struct sched_group *sg); +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) +void register_sched_domain_sysctl(void); +void unregister_sched_domain_sysctl(void); +#else +static inline void register_sched_domain_sysctl(void) +{ +} +static inline void unregister_sched_domain_sysctl(void) +{ +} +#endif + #else static inline void sched_ttwu_pending(void) { } @@ -933,6 +959,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif #ifdef CONFIG_FAIR_GROUP_SCHED + set_task_rq_fair(&p->se, p->se.cfs_rq, tg->cfs_rq[cpu]); p->se.cfs_rq = tg->cfs_rq[cpu]; p->se.parent = tg->se[cpu]; #endif @@ -1008,6 +1035,7 @@ extern struct static_key sched_feat_keys[__SCHED_FEAT_NR]; #endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */ extern struct static_key_false sched_numa_balancing; +extern struct static_key_false sched_schedstats; static inline u64 global_rt_period(void) { @@ -1073,7 +1101,10 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) * We must ensure this doesn't happen until the switch is completely * finished. * - * Pairs with the control dependency and rmb in try_to_wake_up(). + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_acquire() in try_to_wake_up(). */ smp_store_release(&prev->on_cpu, 0); #endif @@ -1110,59 +1141,43 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) #define WEIGHT_IDLEPRIO 3 #define WMULT_IDLEPRIO 1431655765 -/* - * Nice levels are multiplicative, with a gentle 10% change for every - * nice level changed. I.e. when a CPU-bound task goes from nice 0 to - * nice 1, it will get ~10% less CPU time than another CPU-bound task - * that remained on nice 0. - * - * The "10% effect" is relative and cumulative: from _any_ nice level, - * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. - * If a task goes up by ~10% and another task goes down by ~10% then - * the relative distance between them is ~25%.) - */ -static const int prio_to_weight[40] = { - /* -20 */ 88761, 71755, 56483, 46273, 36291, - /* -15 */ 29154, 23254, 18705, 14949, 11916, - /* -10 */ 9548, 7620, 6100, 4904, 3906, - /* -5 */ 3121, 2501, 1991, 1586, 1277, - /* 0 */ 1024, 820, 655, 526, 423, - /* 5 */ 335, 272, 215, 172, 137, - /* 10 */ 110, 87, 70, 56, 45, - /* 15 */ 36, 29, 23, 18, 15, -}; +extern const int sched_prio_to_weight[40]; +extern const u32 sched_prio_to_wmult[40]; /* - * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. + * {de,en}queue flags: + * + * DEQUEUE_SLEEP - task is no longer runnable + * ENQUEUE_WAKEUP - task just became runnable + * + * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks + * are in a known state which allows modification. Such pairs + * should preserve as much state as possible. + * + * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location + * in the runqueue. + * + * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) + * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) + * ENQUEUE_WAKING - sched_class::task_waking was called * - * In cases where the weight does not change often, we can use the - * precalculated inverse to speed up arithmetics by turning divisions - * into multiplications: */ -static const u32 prio_to_wmult[40] = { - /* -20 */ 48388, 59856, 76040, 92818, 118348, - /* -15 */ 147320, 184698, 229616, 287308, 360437, - /* -10 */ 449829, 563644, 704093, 875809, 1099582, - /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, - /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, - /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, - /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, - /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, -}; + +#define DEQUEUE_SLEEP 0x01 +#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ +#define DEQUEUE_MOVE 0x04 /* matches ENQUEUE_MOVE */ #define ENQUEUE_WAKEUP 0x01 -#define ENQUEUE_HEAD 0x02 +#define ENQUEUE_RESTORE 0x02 +#define ENQUEUE_MOVE 0x04 + +#define ENQUEUE_HEAD 0x08 +#define ENQUEUE_REPLENISH 0x10 #ifdef CONFIG_SMP -#define ENQUEUE_WAKING 0x04 /* sched_class::task_waking was called */ +#define ENQUEUE_WAKING 0x20 #else #define ENQUEUE_WAKING 0x00 #endif -#define ENQUEUE_REPLENISH 0x08 -#define ENQUEUE_RESTORE 0x10 - -#define DEQUEUE_SLEEP 0x01 -#define DEQUEUE_SAVE 0x02 #define RETRY_TASK ((void *)-1UL) @@ -1249,16 +1264,8 @@ extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void idle_enter_fair(struct rq *this_rq); -extern void idle_exit_fair(struct rq *this_rq); - extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); -#else - -static inline void idle_enter_fair(struct rq *rq) { } -static inline void idle_exit_fair(struct rq *rq) { } - #endif #ifdef CONFIG_CPU_IDLE @@ -1307,6 +1314,35 @@ unsigned long to_ratio(u64 period, u64 runtime); extern void init_entity_runnable_average(struct sched_entity *se); +#ifdef CONFIG_NO_HZ_FULL +extern bool sched_can_stop_tick(struct rq *rq); + +/* + * Tick may be needed by tasks in the runqueue depending on their policy and + * requirements. If tick is needed, lets send the target an IPI to kick it out of + * nohz mode if necessary. + */ +static inline void sched_update_tick_dependency(struct rq *rq) +{ + int cpu; + + if (!tick_nohz_full_enabled()) + return; + + cpu = cpu_of(rq); + + if (!tick_nohz_full_cpu(cpu)) + return; + + if (sched_can_stop_tick(rq)) + tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); + else + tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); +} +#else +static inline void sched_update_tick_dependency(struct rq *rq) { } +#endif + static inline void add_nr_running(struct rq *rq, unsigned count) { unsigned prev_nr = rq->nr_running; @@ -1318,26 +1354,16 @@ static inline void add_nr_running(struct rq *rq, unsigned count) if (!rq->rd->overload) rq->rd->overload = true; #endif - -#ifdef CONFIG_NO_HZ_FULL - if (tick_nohz_full_cpu(rq->cpu)) { - /* - * Tick is needed if more than one task runs on a CPU. - * Send the target an IPI to kick it out of nohz mode. - * - * We assume that IPI implies full memory barrier and the - * new value of rq->nr_running is visible on reception - * from the target. - */ - tick_nohz_full_kick_cpu(rq->cpu); - } -#endif } + + sched_update_tick_dependency(rq); } static inline void sub_nr_running(struct rq *rq, unsigned count) { rq->nr_running -= count; + /* Check if we still need preemption */ + sched_update_tick_dependency(rq); } static inline void rq_last_tick_reset(struct rq *rq) @@ -1767,3 +1793,64 @@ static inline u64 irq_time_read(int cpu) } #endif /* CONFIG_64BIT */ #endif /* CONFIG_IRQ_TIME_ACCOUNTING */ + +#ifdef CONFIG_CPU_FREQ +DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); + +/** + * cpufreq_update_util - Take a note about CPU utilization changes. + * @time: Current time. + * @util: Current utilization. + * @max: Utilization ceiling. + * + * This function is called by the scheduler on every invocation of + * update_load_avg() on the CPU whose utilization is being updated. + * + * It can only be called from RCU-sched read-side critical sections. + */ +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) +{ + struct update_util_data *data; + + data = rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)); + if (data) + data->func(data, time, util, max); +} + +/** + * cpufreq_trigger_update - Trigger CPU performance state evaluation if needed. + * @time: Current time. + * + * The way cpufreq is currently arranged requires it to evaluate the CPU + * performance state (frequency/voltage) on a regular basis to prevent it from + * being stuck in a completely inadequate performance level for too long. + * That is not guaranteed to happen if the updates are only triggered from CFS, + * though, because they may not be coming in if RT or deadline tasks are active + * all the time (or there are RT and DL tasks only). + * + * As a workaround for that issue, this function is called by the RT and DL + * sched classes to trigger extra cpufreq updates to prevent it from stalling, + * but that really is a band-aid. Going forward it should be replaced with + * solutions targeted more specifically at RT and DL tasks. + */ +static inline void cpufreq_trigger_update(u64 time) +{ + cpufreq_update_util(time, ULONG_MAX, 0); +} +#else +static inline void cpufreq_update_util(u64 time, unsigned long util, unsigned long max) {} +static inline void cpufreq_trigger_update(u64 time) {} +#endif /* CONFIG_CPU_FREQ */ + +static inline void account_reset_rq(struct rq *rq) +{ +#ifdef CONFIG_IRQ_TIME_ACCOUNTING + rq->prev_irq_time = 0; +#endif +#ifdef CONFIG_PARAVIRT + rq->prev_steal_time = 0; +#endif +#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING + rq->prev_steal_time_rq = 0; +#endif +} diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h index b0fbc7632de5..70b3b6a20fb0 100644 --- a/kernel/sched/stats.h +++ b/kernel/sched/stats.h @@ -29,9 +29,10 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) if (rq) rq->rq_sched_info.run_delay += delta; } -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -# define schedstat_set(var, val) do { var = (val); } while (0) +# define schedstat_enabled() static_branch_unlikely(&sched_schedstats) +# define schedstat_inc(rq, field) do { if (schedstat_enabled()) { (rq)->field++; } } while (0) +# define schedstat_add(rq, field, amt) do { if (schedstat_enabled()) { (rq)->field += (amt); } } while (0) +# define schedstat_set(var, val) do { if (schedstat_enabled()) { var = (val); } } while (0) #else /* !CONFIG_SCHEDSTATS */ static inline void rq_sched_info_arrive(struct rq *rq, unsigned long long delta) @@ -42,6 +43,7 @@ rq_sched_info_dequeued(struct rq *rq, unsigned long long delta) static inline void rq_sched_info_depart(struct rq *rq, unsigned long long delta) {} +# define schedstat_enabled() 0 # define schedstat_inc(rq, field) do { } while (0) # define schedstat_add(rq, field, amt) do { } while (0) # define schedstat_set(var, val) do { } while (0) diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c new file mode 100644 index 000000000000..82f0dff90030 --- /dev/null +++ b/kernel/sched/swait.c @@ -0,0 +1,123 @@ +#include <linux/sched.h> +#include <linux/swait.h> + +void __init_swait_queue_head(struct swait_queue_head *q, const char *name, + struct lock_class_key *key) +{ + raw_spin_lock_init(&q->lock); + lockdep_set_class_and_name(&q->lock, key, name); + INIT_LIST_HEAD(&q->task_list); +} +EXPORT_SYMBOL(__init_swait_queue_head); + +/* + * The thing about the wake_up_state() return value; I think we can ignore it. + * + * If for some reason it would return 0, that means the previously waiting + * task is already running, so it will observe condition true (or has already). + */ +void swake_up_locked(struct swait_queue_head *q) +{ + struct swait_queue *curr; + + if (list_empty(&q->task_list)) + return; + + curr = list_first_entry(&q->task_list, typeof(*curr), task_list); + wake_up_process(curr->task); + list_del_init(&curr->task_list); +} +EXPORT_SYMBOL(swake_up_locked); + +void swake_up(struct swait_queue_head *q) +{ + unsigned long flags; + + if (!swait_active(q)) + return; + + raw_spin_lock_irqsave(&q->lock, flags); + swake_up_locked(q); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(swake_up); + +/* + * Does not allow usage from IRQ disabled, since we must be able to + * release IRQs to guarantee bounded hold time. + */ +void swake_up_all(struct swait_queue_head *q) +{ + struct swait_queue *curr; + LIST_HEAD(tmp); + + if (!swait_active(q)) + return; + + raw_spin_lock_irq(&q->lock); + list_splice_init(&q->task_list, &tmp); + while (!list_empty(&tmp)) { + curr = list_first_entry(&tmp, typeof(*curr), task_list); + + wake_up_state(curr->task, TASK_NORMAL); + list_del_init(&curr->task_list); + + if (list_empty(&tmp)) + break; + + raw_spin_unlock_irq(&q->lock); + raw_spin_lock_irq(&q->lock); + } + raw_spin_unlock_irq(&q->lock); +} +EXPORT_SYMBOL(swake_up_all); + +void __prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + wait->task = current; + if (list_empty(&wait->task_list)) + list_add(&wait->task_list, &q->task_list); +} + +void prepare_to_swait(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + unsigned long flags; + + raw_spin_lock_irqsave(&q->lock, flags); + __prepare_to_swait(q, wait); + set_current_state(state); + raw_spin_unlock_irqrestore(&q->lock, flags); +} +EXPORT_SYMBOL(prepare_to_swait); + +long prepare_to_swait_event(struct swait_queue_head *q, struct swait_queue *wait, int state) +{ + if (signal_pending_state(state, current)) + return -ERESTARTSYS; + + prepare_to_swait(q, wait, state); + + return 0; +} +EXPORT_SYMBOL(prepare_to_swait_event); + +void __finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + __set_current_state(TASK_RUNNING); + if (!list_empty(&wait->task_list)) + list_del_init(&wait->task_list); +} + +void finish_swait(struct swait_queue_head *q, struct swait_queue *wait) +{ + unsigned long flags; + + __set_current_state(TASK_RUNNING); + + if (!list_empty_careful(&wait->task_list)) { + raw_spin_lock_irqsave(&q->lock, flags); + list_del_init(&wait->task_list); + raw_spin_unlock_irqrestore(&q->lock, flags); + } +} +EXPORT_SYMBOL(finish_swait); diff --git a/kernel/sched/wait.c b/kernel/sched/wait.c index 052e02672d12..f15d6b6a538a 100644 --- a/kernel/sched/wait.c +++ b/kernel/sched/wait.c @@ -392,7 +392,7 @@ __wait_on_bit(wait_queue_head_t *wq, struct wait_bit_queue *q, do { prepare_to_wait(wq, &q->wait, mode); if (test_bit(q->key.bit_nr, q->key.flags)) - ret = (*action)(&q->key); + ret = (*action)(&q->key, mode); } while (test_bit(q->key.bit_nr, q->key.flags) && !ret); finish_wait(wq, &q->wait); return ret; @@ -431,7 +431,7 @@ __wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q, prepare_to_wait_exclusive(wq, &q->wait, mode); if (!test_bit(q->key.bit_nr, q->key.flags)) continue; - ret = action(&q->key); + ret = action(&q->key, mode); if (!ret) continue; abort_exclusive_wait(wq, &q->wait, mode, &q->key); @@ -581,44 +581,44 @@ void wake_up_atomic_t(atomic_t *p) } EXPORT_SYMBOL(wake_up_atomic_t); -__sched int bit_wait(struct wait_bit_key *word) +__sched int bit_wait(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait); -__sched int bit_wait_io(struct wait_bit_key *word) +__sched int bit_wait_io(struct wait_bit_key *word, int mode) { - if (signal_pending_state(current->state, current)) - return 1; io_schedule(); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL(bit_wait_io); -__sched int bit_wait_timeout(struct wait_bit_key *word) +__sched int bit_wait_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_timeout); -__sched int bit_wait_io_timeout(struct wait_bit_key *word) +__sched int bit_wait_io_timeout(struct wait_bit_key *word, int mode) { unsigned long now = READ_ONCE(jiffies); - if (signal_pending_state(current->state, current)) - return 1; if (time_after_eq(now, word->timeout)) return -EAGAIN; io_schedule_timeout(word->timeout - now); + if (signal_pending_state(mode, current)) + return -EINTR; return 0; } EXPORT_SYMBOL_GPL(bit_wait_io_timeout); diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 580ac2d4024f..e1e5a354854e 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -316,24 +316,24 @@ static inline void seccomp_sync_threads(void) put_seccomp_filter(thread); smp_store_release(&thread->seccomp.filter, caller->seccomp.filter); + + /* + * Don't let an unprivileged task work around + * the no_new_privs restriction by creating + * a thread that sets it up, enters seccomp, + * then dies. + */ + if (task_no_new_privs(caller)) + task_set_no_new_privs(thread); + /* * Opt the other thread into seccomp if needed. * As threads are considered to be trust-realm * equivalent (see ptrace_may_access), it is safe to * allow one thread to transition the other. */ - if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) { - /* - * Don't let an unprivileged task work around - * the no_new_privs restriction by creating - * a thread that sets it up, enters seccomp, - * then dies. - */ - if (task_no_new_privs(caller)) - task_set_no_new_privs(thread); - + if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); - } } } @@ -395,7 +395,7 @@ seccomp_prepare_user_filter(const char __user *user_filter) struct seccomp_filter *filter = ERR_PTR(-EFAULT); #ifdef CONFIG_COMPAT - if (is_compat_task()) { + if (in_compat_syscall()) { struct compat_sock_fprog fprog32; if (copy_from_user(&fprog32, user_filter, sizeof(fprog32))) goto out; @@ -529,7 +529,7 @@ static void __secure_computing_strict(int this_syscall) { int *syscall_whitelist = mode1_syscalls; #ifdef CONFIG_COMPAT - if (is_compat_task()) + if (in_compat_syscall()) syscall_whitelist = mode1_syscalls_32; #endif do { diff --git a/kernel/signal.c b/kernel/signal.c index 0f6bbbe77b46..aa9bf00749c1 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -503,41 +503,6 @@ int unhandled_signal(struct task_struct *tsk, int sig) return !tsk->ptrace; } -/* - * Notify the system that a driver wants to block all signals for this - * process, and wants to be notified if any signals at all were to be - * sent/acted upon. If the notifier routine returns non-zero, then the - * signal will be acted upon after all. If the notifier routine returns 0, - * then then signal will be blocked. Only one block per process is - * allowed. priv is a pointer to private data that the notifier routine - * can use to determine if the signal should be blocked or not. - */ -void -block_all_signals(int (*notifier)(void *priv), void *priv, sigset_t *mask) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier_mask = mask; - current->notifier_data = priv; - current->notifier = notifier; - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - -/* Notify the system that blocking has ended. */ - -void -unblock_all_signals(void) -{ - unsigned long flags; - - spin_lock_irqsave(¤t->sighand->siglock, flags); - current->notifier = NULL; - current->notifier_data = NULL; - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); -} - static void collect_signal(int sig, struct sigpending *list, siginfo_t *info) { struct sigqueue *q, *first = NULL; @@ -580,19 +545,8 @@ static int __dequeue_signal(struct sigpending *pending, sigset_t *mask, { int sig = next_signal(pending, mask); - if (sig) { - if (current->notifier) { - if (sigismember(current->notifier_mask, sig)) { - if (!(current->notifier)(current->notifier_data)) { - clear_thread_flag(TIF_SIGPENDING); - return 0; - } - } - } - + if (sig) collect_signal(sig, pending, info); - } - return sig; } @@ -834,7 +788,7 @@ static bool prepare_signal(int sig, struct task_struct *p, bool force) sigset_t flush; if (signal->flags & (SIGNAL_GROUP_EXIT | SIGNAL_GROUP_COREDUMP)) { - if (signal->flags & SIGNAL_GROUP_COREDUMP) + if (!(signal->flags & SIGNAL_GROUP_EXIT)) return sig == SIGKILL; /* * The process is in the middle of dying, nothing to do. @@ -2483,9 +2437,6 @@ EXPORT_SYMBOL(force_sig); EXPORT_SYMBOL(send_sig); EXPORT_SYMBOL(send_sig_info); EXPORT_SYMBOL(sigprocmask); -EXPORT_SYMBOL(block_all_signals); -EXPORT_SYMBOL(unblock_all_signals); - /* * System call entry points. @@ -2758,6 +2709,10 @@ int copy_siginfo_to_user(siginfo_t __user *to, const siginfo_t *from) err |= __put_user(from->si_upper, &to->si_upper); } #endif +#ifdef SEGV_PKUERR + if (from->si_signo == SIGSEGV && from->si_code == SEGV_PKUERR) + err |= __put_user(from->si_pkey, &to->si_pkey); +#endif break; case __SI_CHLD: err |= __put_user(from->si_pid, &to->si_pid); @@ -3552,13 +3507,15 @@ SYSCALL_DEFINE0(pause) #endif -int sigsuspend(sigset_t *set) +static int sigsuspend(sigset_t *set) { current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } @@ -3628,6 +3585,10 @@ __weak const char *arch_vma_name(struct vm_area_struct *vma) void __init signals_init(void) { + /* If this check fails, the __ARCH_SI_PREAMBLE_SIZE value is wrong! */ + BUILD_BUG_ON(__ARCH_SI_PREAMBLE_SIZE + != offsetof(struct siginfo, _sifields._pad)); + sigqueue_cachep = KMEM_CACHE(sigqueue, SLAB_PANIC); } diff --git a/kernel/smp.c b/kernel/smp.c index 07854477c164..74165443c240 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -105,13 +105,12 @@ void __init call_function_init(void) * previous function call. For multi-cpu calls its even more interesting * as we'll have to ensure no other cpu is observing our csd. */ -static void csd_lock_wait(struct call_single_data *csd) +static __always_inline void csd_lock_wait(struct call_single_data *csd) { - while (smp_load_acquire(&csd->flags) & CSD_FLAG_LOCK) - cpu_relax(); + smp_cond_acquire(!(csd->flags & CSD_FLAG_LOCK)); } -static void csd_lock(struct call_single_data *csd) +static __always_inline void csd_lock(struct call_single_data *csd) { csd_lock_wait(csd); csd->flags |= CSD_FLAG_LOCK; @@ -124,7 +123,7 @@ static void csd_lock(struct call_single_data *csd) smp_wmb(); } -static void csd_unlock(struct call_single_data *csd) +static __always_inline void csd_unlock(struct call_single_data *csd) { WARN_ON(!(csd->flags & CSD_FLAG_LOCK)); @@ -569,6 +568,7 @@ void __init smp_init(void) unsigned int cpu; idle_threads_init(); + cpuhp_threads_init(); /* FIXME: This should be done in userspace --RR */ for_each_present_cpu(cpu) { @@ -669,7 +669,7 @@ void on_each_cpu_cond(bool (*cond_func)(int cpu, void *info), cpumask_var_t cpus; int cpu, ret; - might_sleep_if(gfp_flags & __GFP_WAIT); + might_sleep_if(gfpflags_allow_blocking(gfp_flags)); if (likely(zalloc_cpumask_var(&cpus, (gfp_flags|__GFP_NOWARN)))) { preempt_disable(); diff --git a/kernel/smpboot.c b/kernel/smpboot.c index d264f59bff56..13bc43d1fb22 100644 --- a/kernel/smpboot.c +++ b/kernel/smpboot.c @@ -226,7 +226,7 @@ static void smpboot_unpark_thread(struct smp_hotplug_thread *ht, unsigned int cp kthread_unpark(tsk); } -void smpboot_unpark_threads(unsigned int cpu) +int smpboot_unpark_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -235,6 +235,7 @@ void smpboot_unpark_threads(unsigned int cpu) if (cpumask_test_cpu(cpu, cur->cpumask)) smpboot_unpark_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) @@ -245,7 +246,7 @@ static void smpboot_park_thread(struct smp_hotplug_thread *ht, unsigned int cpu) kthread_park(tsk); } -void smpboot_park_threads(unsigned int cpu) +int smpboot_park_threads(unsigned int cpu) { struct smp_hotplug_thread *cur; @@ -253,6 +254,7 @@ void smpboot_park_threads(unsigned int cpu) list_for_each_entry_reverse(cur, &hotplug_threads, list) smpboot_park_thread(cur, cpu); mutex_unlock(&smpboot_threads_lock); + return 0; } static void smpboot_destroy_threads(struct smp_hotplug_thread *ht) diff --git a/kernel/smpboot.h b/kernel/smpboot.h index 72415a0eb955..485b81cfab34 100644 --- a/kernel/smpboot.h +++ b/kernel/smpboot.h @@ -14,7 +14,9 @@ static inline void idle_threads_init(void) { } #endif int smpboot_create_threads(unsigned int cpu); -void smpboot_park_threads(unsigned int cpu); -void smpboot_unpark_threads(unsigned int cpu); +int smpboot_park_threads(unsigned int cpu); +int smpboot_unpark_threads(unsigned int cpu); + +void __init cpuhp_threads_init(void); #endif diff --git a/kernel/softirq.c b/kernel/softirq.c index 479e4436f787..17caf4b63342 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -116,9 +116,9 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) if (preempt_count() == cnt) { #ifdef CONFIG_DEBUG_PREEMPT - current->preempt_disable_ip = get_parent_ip(CALLER_ADDR1); + current->preempt_disable_ip = get_lock_parent_ip(); #endif - trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1)); + trace_preempt_off(CALLER_ADDR0, get_lock_parent_ip()); } } EXPORT_SYMBOL(__local_bh_disable_ip); @@ -227,7 +227,7 @@ static inline bool lockdep_softirq_start(void) { return false; } static inline void lockdep_softirq_end(bool in_hardirq) { } #endif -asmlinkage __visible void __do_softirq(void) +asmlinkage __visible void __softirq_entry __do_softirq(void) { unsigned long end = jiffies + MAX_SOFTIRQ_TIME; unsigned long old_flags = current->flags; diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index 867bc20e1ef1..a467e6c28a3b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -28,7 +28,6 @@ */ struct cpu_stop_done { atomic_t nr_todo; /* nr left to execute */ - bool executed; /* actually executed? */ int ret; /* collected return value */ struct completion completion; /* fired if nr_todo reaches 0 */ }; @@ -63,14 +62,10 @@ static void cpu_stop_init_done(struct cpu_stop_done *done, unsigned int nr_todo) } /* signal completion unless @done is NULL */ -static void cpu_stop_signal_done(struct cpu_stop_done *done, bool executed) +static void cpu_stop_signal_done(struct cpu_stop_done *done) { - if (done) { - if (executed) - done->executed = true; - if (atomic_dec_and_test(&done->nr_todo)) - complete(&done->completion); - } + if (atomic_dec_and_test(&done->nr_todo)) + complete(&done->completion); } static void __cpu_stop_queue_work(struct cpu_stopper *stopper, @@ -81,17 +76,21 @@ static void __cpu_stop_queue_work(struct cpu_stopper *stopper, } /* queue @work to @stopper. if offline, @work is completed immediately */ -static void cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) +static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); unsigned long flags; + bool enabled; spin_lock_irqsave(&stopper->lock, flags); - if (stopper->enabled) + enabled = stopper->enabled; + if (enabled) __cpu_stop_queue_work(stopper, work); - else - cpu_stop_signal_done(work->done, false); + else if (work->done) + cpu_stop_signal_done(work->done); spin_unlock_irqrestore(&stopper->lock, flags); + + return enabled; } /** @@ -124,9 +123,10 @@ int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; cpu_stop_init_done(&done, 1); - cpu_stop_queue_work(cpu, &work); + if (!cpu_stop_queue_work(cpu, &work)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /* This controls the threads on each CPU. */ @@ -258,7 +258,6 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * struct cpu_stop_work work1, work2; struct multi_stop_data msdata; - preempt_disable(); msdata = (struct multi_stop_data){ .fn = fn, .data = arg, @@ -277,16 +276,11 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * if (cpu1 > cpu2) swap(cpu1, cpu2); - if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) { - preempt_enable(); + if (cpu_stop_queue_two_works(cpu1, &work1, cpu2, &work2)) return -ENOENT; - } - - preempt_enable(); wait_for_completion(&done.completion); - - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -302,23 +296,28 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * * * CONTEXT: * Don't care. + * + * RETURNS: + * true if cpu_stop_work was queued successfully and @fn will be called, + * false otherwise. */ -void stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, +bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; - cpu_stop_queue_work(cpu, work_buf); + return cpu_stop_queue_work(cpu, work_buf); } /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); -static void queue_stop_cpus_work(const struct cpumask *cpumask, +static bool queue_stop_cpus_work(const struct cpumask *cpumask, cpu_stop_fn_t fn, void *arg, struct cpu_stop_done *done) { struct cpu_stop_work *work; unsigned int cpu; + bool queued = false; /* * Disable preemption while queueing to avoid getting @@ -331,9 +330,12 @@ static void queue_stop_cpus_work(const struct cpumask *cpumask, work->fn = fn; work->arg = arg; work->done = done; - cpu_stop_queue_work(cpu, work); + if (cpu_stop_queue_work(cpu, work)) + queued = true; } lg_global_unlock(&stop_cpus_lock); + + return queued; } static int __stop_cpus(const struct cpumask *cpumask, @@ -342,9 +344,10 @@ static int __stop_cpus(const struct cpumask *cpumask, struct cpu_stop_done done; cpu_stop_init_done(&done, cpumask_weight(cpumask)); - queue_stop_cpus_work(cpumask, fn, arg, &done); + if (!queue_stop_cpus_work(cpumask, fn, arg, &done)) + return -ENOENT; wait_for_completion(&done.completion); - return done.executed ? done.ret : -ENOENT; + return done.ret; } /** @@ -432,7 +435,6 @@ static void cpu_stopper_thread(unsigned int cpu) { struct cpu_stopper *stopper = &per_cpu(cpu_stopper, cpu); struct cpu_stop_work *work; - int ret; repeat: work = NULL; @@ -448,23 +450,19 @@ repeat: cpu_stop_fn_t fn = work->fn; void *arg = work->arg; struct cpu_stop_done *done = work->done; - char ksym_buf[KSYM_NAME_LEN] __maybe_unused; - - /* cpu stop callbacks are not allowed to sleep */ - preempt_disable(); + int ret; + /* cpu stop callbacks must not sleep, make in_atomic() == T */ + preempt_count_inc(); ret = fn(arg); - if (ret) - done->ret = ret; - - /* restore preemption and check it's still balanced */ - preempt_enable(); + if (done) { + if (ret) + done->ret = ret; + cpu_stop_signal_done(done); + } + preempt_count_dec(); WARN_ONCE(preempt_count(), - "cpu_stop: %s(%p) leaked preempt count\n", - kallsyms_lookup((unsigned long)fn, NULL, NULL, NULL, - ksym_buf), arg); - - cpu_stop_signal_done(done, true); + "cpu_stop: %pf(%p) leaked preempt count\n", fn, arg); goto repeat; } } @@ -531,8 +529,6 @@ static int __init cpu_stop_init(void) } early_initcall(cpu_stop_init); -#ifdef CONFIG_STOP_MACHINE - static int __stop_machine(cpu_stop_fn_t fn, void *data, const struct cpumask *cpus) { struct multi_stop_data msdata = { @@ -630,5 +626,3 @@ int stop_machine_from_inactive_cpu(cpu_stop_fn_t fn, void *data, mutex_unlock(&stop_cpus_mutex); return ret ?: done.ret; } - -#endif /* CONFIG_STOP_MACHINE */ diff --git a/kernel/sys.c b/kernel/sys.c index fa2f2f671a5c..cf8ba545c7d3 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,7 +222,7 @@ SYSCALL_DEFINE3(setpriority, int, which, int, who, int, niceval) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) error = set_one_prio(p, niceval, error); } while_each_thread(g, p); if (!uid_eq(uid, cred->uid)) @@ -290,7 +290,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) goto out_unlock; /* No processes for this user */ } do_each_thread(g, p) { - if (uid_eq(task_uid(p), uid)) { + if (uid_eq(task_uid(p), uid) && task_pid_vnr(p)) { niceval = nice_to_rlimit(task_nice(p)); if (niceval > retval) retval = niceval; @@ -1853,11 +1853,13 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data user_auxv[AT_VECTOR_SIZE - 1] = AT_NULL; } - if (prctl_map.exe_fd != (u32)-1) + if (prctl_map.exe_fd != (u32)-1) { error = prctl_set_mm_exe_file(mm, prctl_map.exe_fd); - down_read(&mm->mmap_sem); - if (error) - goto out; + if (error) + return error; + } + + down_write(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -1894,10 +1896,8 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - error = 0; -out: - up_read(&mm->mmap_sem); - return error; + up_write(&mm->mmap_sem); + return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ @@ -1963,7 +1963,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = -EINVAL; - down_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); vma = find_vma(mm, addr); prctl_map.start_code = mm->start_code; @@ -2056,7 +2056,7 @@ static int prctl_set_mm(int opt, unsigned long addr, error = 0; out: - up_read(&mm->mmap_sem); + up_write(&mm->mmap_sem); return error; } @@ -2169,7 +2169,10 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = perf_event_task_enable(); break; case PR_GET_TIMERSLACK: - error = current->timer_slack_ns; + if (current->timer_slack_ns > ULONG_MAX) + error = ULONG_MAX; + else + error = current->timer_slack_ns; break; case PR_SET_TIMERSLACK: if (arg2 <= 0) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 0623787ec67a..2c5e3a8e00d7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,6 +174,7 @@ cond_syscall(sys_setfsuid); cond_syscall(sys_setfsgid); cond_syscall(sys_capget); cond_syscall(sys_capset); +cond_syscall(sys_copy_file_range); /* arch-specific weak syscall entries */ cond_syscall(sys_pciconfig_read); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index dc6858d6639e..725587f10667 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -126,6 +126,7 @@ static int __maybe_unused two = 2; static int __maybe_unused four = 4; static unsigned long one_ul = 1; static int one_hundred = 100; +static int one_thousand = 1000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -173,7 +174,7 @@ extern int no_unaligned_warning; #define SYSCTL_WRITES_WARN 0 #define SYSCTL_WRITES_STRICT 1 -static int sysctl_writes_strict = SYSCTL_WRITES_WARN; +static int sysctl_writes_strict = SYSCTL_WRITES_STRICT; static int proc_do_cad_pid(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos); @@ -350,6 +351,17 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, +#ifdef CONFIG_SCHEDSTATS + { + .procname = "sched_schedstats", + .data = NULL, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = sysctl_schedstats, + .extra1 = &zero, + .extra2 = &one, + }, +#endif /* CONFIG_SCHEDSTATS */ #endif /* CONFIG_SMP */ #ifdef CONFIG_NUMA_BALANCING { @@ -505,7 +517,7 @@ static struct ctl_table kern_table[] = { .data = &latencytop_enabled, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = sysctl_latencytop, }, #endif #ifdef CONFIG_BLK_DEV_INITRD @@ -1393,6 +1405,15 @@ static struct ctl_table vm_table[] = { .extra1 = &zero, }, { + .procname = "watermark_scale_factor", + .data = &watermark_scale_factor, + .maxlen = sizeof(watermark_scale_factor), + .mode = 0644, + .proc_handler = watermark_scale_factor_sysctl_handler, + .extra1 = &one, + .extra2 = &one_thousand, + }, + { .procname = "percpu_pagelist_fraction", .data = &percpu_pagelist_fraction, .maxlen = sizeof(percpu_pagelist_fraction), @@ -1568,6 +1589,28 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_doulongvec_minmax, }, +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_BITS + { + .procname = "mmap_rnd_bits", + .data = &mmap_rnd_bits, + .maxlen = sizeof(mmap_rnd_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_bits_min, + .extra2 = (void *)&mmap_rnd_bits_max, + }, +#endif +#ifdef CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS + { + .procname = "mmap_rnd_compat_bits", + .data = &mmap_rnd_compat_bits, + .maxlen = sizeof(mmap_rnd_compat_bits), + .mode = 0600, + .proc_handler = proc_dointvec_minmax, + .extra1 = (void *)&mmap_rnd_compat_bits_min, + .extra2 = (void *)&mmap_rnd_compat_bits_max, + }, +#endif { } }; @@ -1735,6 +1778,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; @@ -2047,9 +2104,8 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, void *data) { int *i, vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!tbl_data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2078,15 +2134,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first=0) { @@ -2094,11 +2144,11 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, bool neg; if (write) { - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); if (!left) break; - err = proc_get_long(&kbuf, &left, &lval, &neg, + err = proc_get_long(&p, &left, &lval, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2125,10 +2175,9 @@ static int __do_proc_dointvec(void *tbl_data, struct ctl_table *table, if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err && left) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2310,9 +2359,8 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int { unsigned long *i, *min, *max; int vleft, first = 1, err = 0; - unsigned long page = 0; size_t left; - char *kbuf; + char *kbuf = NULL, *p; if (!data || !table->maxlen || !*lenp || (*ppos && !write)) { *lenp = 0; @@ -2340,15 +2388,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - err = -EFAULT; - goto free; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); } for (; left && vleft--; i++, first = 0) { @@ -2357,9 +2399,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (write) { bool neg; - left -= proc_skip_spaces(&kbuf); + left -= proc_skip_spaces(&p); - err = proc_get_long(&kbuf, &left, &val, &neg, + err = proc_get_long(&p, &left, &val, &neg, proc_wspace_sep, sizeof(proc_wspace_sep), NULL); if (err) @@ -2385,10 +2427,9 @@ static int __do_proc_doulongvec_minmax(void *data, struct ctl_table *table, int if (!write && !first && left && !err) err = proc_put_char(&buffer, &left, '\n'); if (write && !err) - left -= proc_skip_spaces(&kbuf); -free: + left -= proc_skip_spaces(&p); if (write) { - free_page(page); + kfree(kbuf); if (first) return err ? : -EINVAL; } @@ -2650,34 +2691,27 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, } if (write) { - unsigned long page = 0; - char *kbuf; + char *kbuf, *p; if (left > PAGE_SIZE - 1) left = PAGE_SIZE - 1; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!kbuf) - return -ENOMEM; - if (copy_from_user(kbuf, buffer, left)) { - free_page(page); - return -EFAULT; - } - kbuf[left] = 0; + p = kbuf = memdup_user_nul(buffer, left); + if (IS_ERR(kbuf)) + return PTR_ERR(kbuf); tmp_bitmap = kzalloc(BITS_TO_LONGS(bitmap_len) * sizeof(unsigned long), GFP_KERNEL); if (!tmp_bitmap) { - free_page(page); + kfree(kbuf); return -ENOMEM; } - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); while (!err && left) { unsigned long val_a, val_b; bool neg; - err = proc_get_long(&kbuf, &left, &val_a, &neg, tr_a, + err = proc_get_long(&p, &left, &val_a, &neg, tr_a, sizeof(tr_a), &c); if (err) break; @@ -2688,12 +2722,12 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, val_b = val_a; if (left) { - kbuf++; + p++; left--; } if (c == '-') { - err = proc_get_long(&kbuf, &left, &val_b, + err = proc_get_long(&p, &left, &val_b, &neg, tr_b, sizeof(tr_b), &c); if (err) @@ -2704,16 +2738,16 @@ int proc_do_large_bitmap(struct ctl_table *table, int write, break; } if (left) { - kbuf++; + p++; left--; } } bitmap_set(tmp_bitmap, val_a, val_b - val_a + 1); first = 0; - proc_skip_char(&kbuf, &left, '\n'); + proc_skip_char(&p, &left, '\n'); } - free_page(page); + kfree(kbuf); } else { unsigned long bit_a, bit_b = 0; diff --git a/kernel/sysctl_binary.c b/kernel/sysctl_binary.c index 7e7746a42a62..10a1d7dc9313 100644 --- a/kernel/sysctl_binary.c +++ b/kernel/sysctl_binary.c @@ -1321,7 +1321,7 @@ static ssize_t binary_sysctl(const int *name, int nlen, } mnt = task_active_pid_ns(current)->proc_mnt; - file = file_open_root(mnt->mnt_root, mnt, pathname, flags); + file = file_open_root(mnt->mnt_root, mnt, pathname, flags, 0); result = PTR_ERR(file); if (IS_ERR(file)) goto out_putname; diff --git a/kernel/time/alarmtimer.c b/kernel/time/alarmtimer.c index 7fbba635a549..e840ed867a5d 100644 --- a/kernel/time/alarmtimer.c +++ b/kernel/time/alarmtimer.c @@ -271,11 +271,27 @@ static int alarmtimer_suspend(struct device *dev) __pm_wakeup_event(ws, MSEC_PER_SEC); return ret; } + +static int alarmtimer_resume(struct device *dev) +{ + struct rtc_device *rtc; + + rtc = alarmtimer_get_rtcdev(); + if (rtc) + rtc_timer_cancel(rtc, &rtctimer); + return 0; +} + #else static int alarmtimer_suspend(struct device *dev) { return 0; } + +static int alarmtimer_resume(struct device *dev) +{ + return 0; +} #endif static void alarmtimer_freezerset(ktime_t absexp, enum alarmtimer_type type) @@ -800,6 +816,7 @@ out: /* Suspend hook structures */ static const struct dev_pm_ops alarmtimer_pm_ops = { .suspend = alarmtimer_suspend, + .resume = alarmtimer_resume, }; static struct platform_driver alarmtimer_driver = { diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index 0d8fe8b8f727..56ece145a814 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -217,9 +217,9 @@ static void clocksource_watchdog(unsigned long data) continue; /* Check the deviation from the watchdog clocksource. */ - if (abs64(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { - pr_warn("timekeeping watchdog: Marking clocksource '%s' as unstable because the skew is too large:\n", - cs->name); + if (abs(cs_nsec - wd_nsec) > WATCHDOG_THRESHOLD) { + pr_warn("timekeeping watchdog on CPU%d: Marking clocksource '%s' as unstable because the skew is too large:\n", + smp_processor_id(), cs->name); pr_warn(" '%s' wd_now: %llx wd_last: %llx mask: %llx\n", watchdog->name, wdnow, wdlast, watchdog->mask); pr_warn(" '%s' cs_now: %llx cs_last: %llx mask: %llx\n", @@ -323,13 +323,42 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) /* cs is a watchdog. */ if (cs->flags & CLOCK_SOURCE_IS_CONTINUOUS) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; + } + spin_unlock_irqrestore(&watchdog_lock, flags); +} + +static void clocksource_select_watchdog(bool fallback) +{ + struct clocksource *cs, *old_wd; + unsigned long flags; + + spin_lock_irqsave(&watchdog_lock, flags); + /* save current watchdog */ + old_wd = watchdog; + if (fallback) + watchdog = NULL; + + list_for_each_entry(cs, &clocksource_list, list) { + /* cs is a clocksource to be watched. */ + if (cs->flags & CLOCK_SOURCE_MUST_VERIFY) + continue; + + /* Skip current if we were requested for a fallback. */ + if (fallback && cs == old_wd) + continue; + /* Pick the best watchdog. */ - if (!watchdog || cs->rating > watchdog->rating) { + if (!watchdog || cs->rating > watchdog->rating) watchdog = cs; - /* Reset watchdog cycles */ - clocksource_reset_watchdog(); - } } + /* If we failed to find a fallback restore the old one. */ + if (!watchdog) + watchdog = old_wd; + + /* If we changed the watchdog we need to reset cycles. */ + if (watchdog != old_wd) + clocksource_reset_watchdog(); + /* Check if the watchdog timer needs to be started. */ clocksource_start_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); @@ -404,6 +433,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) cs->flags |= CLOCK_SOURCE_VALID_FOR_HRES; } +static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } static inline int __clocksource_watchdog_kthread(void) { return 0; } @@ -736,6 +766,7 @@ int __clocksource_register_scale(struct clocksource *cs, u32 scale, u32 freq) clocksource_enqueue(cs); clocksource_enqueue_watchdog(cs); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); return 0; } @@ -758,6 +789,7 @@ void clocksource_change_rating(struct clocksource *cs, int rating) mutex_lock(&clocksource_mutex); __clocksource_change_rating(cs, rating); clocksource_select(); + clocksource_select_watchdog(false); mutex_unlock(&clocksource_mutex); } EXPORT_SYMBOL(clocksource_change_rating); @@ -767,12 +799,12 @@ EXPORT_SYMBOL(clocksource_change_rating); */ static int clocksource_unbind(struct clocksource *cs) { - /* - * I really can't convince myself to support this on hardware - * designed by lobotomized monkeys. - */ - if (clocksource_is_watchdog(cs)) - return -EBUSY; + if (clocksource_is_watchdog(cs)) { + /* Select and try to install a replacement watchdog. */ + clocksource_select_watchdog(true); + if (clocksource_is_watchdog(cs)) + return -EBUSY; + } if (cs == curr_clocksource) { /* Select and try to install a replacement clock source */ diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..fa0b983290cf 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -515,7 +515,7 @@ static inline ktime_t hrtimer_update_base(struct hrtimer_cpu_base *base) /* * High resolution timer enabled ? */ -static int hrtimer_hres_enabled __read_mostly = 1; +static bool hrtimer_hres_enabled __read_mostly = true; unsigned int hrtimer_resolution __read_mostly = LOW_RES_NSEC; EXPORT_SYMBOL_GPL(hrtimer_resolution); @@ -524,13 +524,7 @@ EXPORT_SYMBOL_GPL(hrtimer_resolution); */ static int __init setup_hrtimer_hres(char *str) { - if (!strcmp(str, "off")) - hrtimer_hres_enabled = 0; - else if (!strcmp(str, "on")) - hrtimer_hres_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &hrtimer_hres_enabled) == 0); } __setup("highres=", setup_hrtimer_hres); @@ -897,10 +891,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +924,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +948,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -963,7 +973,7 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest * relative (HRTIMER_MODE_REL) */ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, - unsigned long delta_ns, const enum hrtimer_mode mode) + u64 delta_ns, const enum hrtimer_mode mode) { struct hrtimer_clock_base *base, *new_base; unsigned long flags; @@ -974,19 +984,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1075,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1220,6 +1225,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, fn = timer->function; /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock * the timer base. @@ -1529,7 +1542,7 @@ long hrtimer_nanosleep(struct timespec *rqtp, struct timespec __user *rmtp, struct restart_block *restart; struct hrtimer_sleeper t; int ret = 0; - unsigned long slack; + u64 slack; slack = current->timer_slack_ns; if (dl_task(current) || rt_task(current)) @@ -1705,7 +1718,7 @@ void __init hrtimers_init(void) * @clock: timer clock, CLOCK_MONOTONIC or CLOCK_REALTIME */ int __sched -schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, +schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, const enum hrtimer_mode mode, int clock) { struct hrtimer_sleeper t; @@ -1773,7 +1786,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, unsigned long delta, * * Returns 0 when the timer has expired otherwise -EINTR */ -int __sched schedule_hrtimeout_range(ktime_t *expires, unsigned long delta, +int __sched schedule_hrtimeout_range(ktime_t *expires, u64 delta, const enum hrtimer_mode mode) { return schedule_hrtimeout_range_clock(expires, delta, mode, diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above diff --git a/kernel/time/jiffies.c b/kernel/time/jiffies.c index 347fecf86a3f..555e21f7b966 100644 --- a/kernel/time/jiffies.c +++ b/kernel/time/jiffies.c @@ -68,7 +68,7 @@ static struct clocksource clocksource_jiffies = { .name = "jiffies", .rating = 1, /* lowest valid rating*/ .read = jiffies_read, - .mask = 0xffffffff, /*32bits*/ + .mask = CLOCKSOURCE_MASK(32), .mult = NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */ .shift = JIFFIES_SHIFT, .max_cycles = 10, diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 149cc8086aea..6df8927c58a5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -16,8 +16,11 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/rtc.h> +#include <linux/math64.h> #include "ntp_internal.h" +#include "timekeeping_internal.h" + /* * NTP timekeeping variables: @@ -70,7 +73,7 @@ static long time_esterror = NTP_PHASE_LIMIT; static s64 time_freq; /* time at last adjustment (secs): */ -static long time_reftime; +static time64_t time_reftime; static long time_adjust; @@ -297,25 +300,27 @@ static void ntp_update_offset(long offset) if (!(time_status & STA_PLL)) return; - if (!(time_status & STA_NANO)) + if (!(time_status & STA_NANO)) { + /* Make sure the multiplication below won't overflow */ + offset = clamp(offset, -USEC_PER_SEC, USEC_PER_SEC); offset *= NSEC_PER_USEC; + } /* * Scale the phase adjustment and * clamp to the operating range. */ - offset = min(offset, MAXPHASE); - offset = max(offset, -MAXPHASE); + offset = clamp(offset, -MAXPHASE, MAXPHASE); /* * Select how the frequency is to be controlled * and in which mode (PLL or FLL). */ - secs = get_seconds() - time_reftime; + secs = (long)(__ktime_get_real_seconds() - time_reftime); if (unlikely(time_status & STA_FREQHOLD)) secs = 0; - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); offset64 = offset; freq_adj = ntp_update_offset_fll(offset64, secs); @@ -390,10 +395,11 @@ ktime_t ntp_get_next_leap(void) * * Also handles leap second processing, and returns leap offset */ -int second_overflow(unsigned long secs) +int second_overflow(time64_t secs) { s64 delta; int leap = 0; + s32 rem; /* * Leap second processing. If in leap-insert state at the end of the @@ -404,19 +410,19 @@ int second_overflow(unsigned long secs) case TIME_OK: if (time_status & STA_INS) { time_state = TIME_INS; - ntp_next_leap_sec = secs + SECS_PER_DAY - - (secs % SECS_PER_DAY); + div_s64_rem(secs, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } else if (time_status & STA_DEL) { time_state = TIME_DEL; - ntp_next_leap_sec = secs + SECS_PER_DAY - - ((secs+1) % SECS_PER_DAY); + div_s64_rem(secs + 1, SECS_PER_DAY, &rem); + ntp_next_leap_sec = secs + SECS_PER_DAY - rem; } break; case TIME_INS: if (!(time_status & STA_INS)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if (secs % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = -1; time_state = TIME_OOP; printk(KERN_NOTICE @@ -427,7 +433,7 @@ int second_overflow(unsigned long secs) if (!(time_status & STA_DEL)) { ntp_next_leap_sec = TIME64_MAX; time_state = TIME_OK; - } else if ((secs + 1) % SECS_PER_DAY == 0) { + } else if (secs == ntp_next_leap_sec) { leap = 1; ntp_next_leap_sec = TIME64_MAX; time_state = TIME_WAIT; @@ -590,7 +596,7 @@ static inline void process_adj_status(struct timex *txc, struct timespec64 *ts) * reference time to current time. */ if (!(time_status & STA_PLL) && (txc->status & STA_PLL)) - time_reftime = get_seconds(); + time_reftime = __ktime_get_real_seconds(); /* only set allowed bits */ time_status &= STA_RONLY; @@ -674,8 +680,24 @@ int ntp_validate_timex(struct timex *txc) return -EINVAL; } - if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) - return -EPERM; + if (txc->modes & ADJ_SETOFFSET) { + /* In order to inject time, you gotta be super-user! */ + if (!capable(CAP_SYS_TIME)) + return -EPERM; + + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } + } /* * Check for potential multiplication overflows that can diff --git a/kernel/time/ntp_internal.h b/kernel/time/ntp_internal.h index af924470eac0..d8a7c11fa71a 100644 --- a/kernel/time/ntp_internal.h +++ b/kernel/time/ntp_internal.h @@ -6,7 +6,7 @@ extern void ntp_clear(void); /* Returns how long ticks are at present, in ns / 2^NTP_SCALE_SHIFT. */ extern u64 ntp_tick_length(void); extern ktime_t ntp_get_next_leap(void); -extern int second_overflow(unsigned long secs); +extern int second_overflow(time64_t secs); extern int ntp_validate_timex(struct timex *); extern int __do_adjtimex(struct timex *, struct timespec64 *, s32 *); extern void __hardpps(const struct timespec64 *, const struct timespec64 *); diff --git a/kernel/time/posix-clock.c b/kernel/time/posix-clock.c index ce033c7aa2e8..9cff0ab82b63 100644 --- a/kernel/time/posix-clock.c +++ b/kernel/time/posix-clock.c @@ -69,10 +69,10 @@ static ssize_t posix_clock_read(struct file *fp, char __user *buf, static unsigned int posix_clock_poll(struct file *fp, poll_table *wait) { struct posix_clock *clk = get_posix_clock(fp); - int result = 0; + unsigned int result = 0; if (!clk) - return -ENODEV; + return POLLERR; if (clk->ops.poll) result = clk->ops.poll(clk, fp, wait); diff --git a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c index f5e86d282d52..1cafba860b08 100644 --- a/kernel/time/posix-cpu-timers.c +++ b/kernel/time/posix-cpu-timers.c @@ -333,7 +333,6 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) return err; } - /* * Validate the clockid_t for a new CPU-clock timer, and initialize the timer. * This is called from sys_timer_create() and do_cpu_nanosleep() with the @@ -517,6 +516,10 @@ static void arm_timer(struct k_itimer *timer) cputime_expires->sched_exp = exp; break; } + if (CPUCLOCK_PERTHREAD(timer->it_clock)) + tick_dep_set_task(p, TICK_DEP_BIT_POSIX_TIMER); + else + tick_dep_set_signal(p->signal, TICK_DEP_BIT_POSIX_TIMER); } } @@ -582,39 +585,6 @@ static int cpu_timer_sample_group(const clockid_t which_clock, return 0; } -#ifdef CONFIG_NO_HZ_FULL -static void nohz_kick_work_fn(struct work_struct *work) -{ - tick_nohz_full_kick_all(); -} - -static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); - -/* - * We need the IPIs to be sent from sane process context. - * The posix cpu timers are always set with irqs disabled. - */ -static void posix_cpu_timer_kick_nohz(void) -{ - if (context_tracking_is_enabled()) - schedule_work(&nohz_kick_work); -} - -bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -{ - if (!task_cputime_zero(&tsk->cputime_expires)) - return false; - - /* Check if cputimer is running. This is accessed without locking. */ - if (READ_ONCE(tsk->signal->cputimer.running)) - return false; - - return true; -} -#else -static inline void posix_cpu_timer_kick_nohz(void) { } -#endif - /* * Guts of sys_timer_settime for CPU timers. * This is called with the timer locked and interrupts disabled. @@ -761,8 +731,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, sample_to_timespec(timer->it_clock, old_incr, &old->it_interval); } - if (!ret) - posix_cpu_timer_kick_nohz(); + return ret; } @@ -911,6 +880,8 @@ static void check_thread_timers(struct task_struct *tsk, __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk); } } + if (task_cputime_zero(tsk_expires)) + tick_dep_clear_task(tsk, TICK_DEP_BIT_POSIX_TIMER); } static inline void stop_process_timers(struct signal_struct *sig) @@ -919,6 +890,7 @@ static inline void stop_process_timers(struct signal_struct *sig) /* Turn off cputimer->running. This is done without locking. */ WRITE_ONCE(cputimer->running, false); + tick_dep_clear_signal(sig, TICK_DEP_BIT_POSIX_TIMER); } static u32 onecputick; @@ -1095,8 +1067,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) arm_timer(timer); unlock_task_sighand(p, &flags); - /* Kick full dynticks CPUs in case they need to tick on the new timer */ - posix_cpu_timer_kick_nohz(); out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1270,7 +1240,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, } if (!*newval) - goto out; + return; *newval += now; } @@ -1288,8 +1258,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, tsk->signal->cputime_expires.virt_exp = *newval; break; } -out: - posix_cpu_timer_kick_nohz(); + + tick_dep_set_signal(tsk->signal, TICK_DEP_BIT_POSIX_TIMER); } static int do_cpu_nanosleep(const clockid_t which_clock, int flags, diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 7c7ec4515983..58e3310c9b21 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -22,7 +22,6 @@ #include <linux/module.h> #include <linux/irq_work.h> #include <linux/posix-timers.h> -#include <linux/perf_event.h> #include <linux/context_tracking.h> #include <asm/irq_regs.h> @@ -36,16 +35,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! */ @@ -143,7 +143,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) * when we go busy again does not account too much ticks. */ if (ts->tick_stopped) { - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); if (is_idle_task(current)) ts->idle_jiffies++; } @@ -151,59 +151,67 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; cpumask_var_t housekeeping_mask; bool tick_nohz_full_running; +static atomic_t tick_dep_mask; -static bool can_stop_full_tick(void) +static bool check_tick_dependency(atomic_t *dep) +{ + int val = atomic_read(dep); + + if (val & TICK_DEP_MASK_POSIX_TIMER) { + trace_tick_stop(0, TICK_DEP_MASK_POSIX_TIMER); + return true; + } + + if (val & TICK_DEP_MASK_PERF_EVENTS) { + trace_tick_stop(0, TICK_DEP_MASK_PERF_EVENTS); + return true; + } + + if (val & TICK_DEP_MASK_SCHED) { + trace_tick_stop(0, TICK_DEP_MASK_SCHED); + return true; + } + + if (val & TICK_DEP_MASK_CLOCK_UNSTABLE) { + trace_tick_stop(0, TICK_DEP_MASK_CLOCK_UNSTABLE); + return true; + } + + return false; +} + +static bool can_stop_full_tick(struct tick_sched *ts) { WARN_ON_ONCE(!irqs_disabled()); - if (!sched_can_stop_tick()) { - trace_tick_stop(0, "more than 1 task in runqueue\n"); + if (check_tick_dependency(&tick_dep_mask)) return false; - } - if (!posix_cpu_timers_can_stop_tick(current)) { - trace_tick_stop(0, "posix timers running\n"); + if (check_tick_dependency(&ts->tick_dep_mask)) return false; - } - if (!perf_event_can_stop_tick()) { - trace_tick_stop(0, "perf events running\n"); + if (check_tick_dependency(¤t->tick_dep_mask)) return false; - } - /* sched_clock_tick() needs us? */ -#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK - /* - * TODO: kick full dynticks CPUs when - * sched_clock_stable is set. - */ - if (!sched_clock_stable()) { - trace_tick_stop(0, "unstable sched clock\n"); - /* - * Don't allow the user to think they can get - * full NO_HZ with this machine. - */ - WARN_ONCE(tick_nohz_full_running, - "NO_HZ FULL will not work with unstable sched clock"); + if (check_tick_dependency(¤t->signal->tick_dep_mask)) return false; - } -#endif return true; } -static void nohz_full_kick_work_func(struct irq_work *work) +static void nohz_full_kick_func(struct irq_work *work) { /* Empty, the tick restart happens on tick_nohz_irq_exit() */ } static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { - .func = nohz_full_kick_work_func, + .func = nohz_full_kick_func, }; /* @@ -212,7 +220,7 @@ static DEFINE_PER_CPU(struct irq_work, nohz_full_kick_work) = { * This kick, unlike tick_nohz_full_kick_cpu() and tick_nohz_full_kick_all(), * is NMI safe. */ -void tick_nohz_full_kick(void) +static void tick_nohz_full_kick(void) { if (!tick_nohz_full_cpu(smp_processor_id())) return; @@ -232,27 +240,112 @@ void tick_nohz_full_kick_cpu(int cpu) irq_work_queue_on(&per_cpu(nohz_full_kick_work, cpu), cpu); } -static void nohz_full_kick_ipi(void *info) -{ - /* Empty, the tick restart happens on tick_nohz_irq_exit() */ -} - /* * Kick all full dynticks CPUs in order to force these to re-evaluate * their dependency on the tick and restart it if necessary. */ -void tick_nohz_full_kick_all(void) +static void tick_nohz_full_kick_all(void) { + int cpu; + if (!tick_nohz_full_running) return; preempt_disable(); - smp_call_function_many(tick_nohz_full_mask, - nohz_full_kick_ipi, NULL, false); - tick_nohz_full_kick(); + for_each_cpu_and(cpu, tick_nohz_full_mask, cpu_online_mask) + tick_nohz_full_kick_cpu(cpu); preempt_enable(); } +static void tick_nohz_dep_set_all(atomic_t *dep, + enum tick_dep_bits bit) +{ + int prev; + + prev = atomic_fetch_or(dep, BIT(bit)); + if (!prev) + tick_nohz_full_kick_all(); +} + +/* + * Set a global tick dependency. Used by perf events that rely on freq and + * by unstable clock. + */ +void tick_nohz_dep_set(enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&tick_dep_mask, bit); +} + +void tick_nohz_dep_clear(enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tick_dep_mask); +} + +/* + * Set per-CPU tick dependency. Used by scheduler and perf events in order to + * manage events throttling. + */ +void tick_nohz_dep_set_cpu(int cpu, enum tick_dep_bits bit) +{ + int prev; + struct tick_sched *ts; + + ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + prev = atomic_fetch_or(&ts->tick_dep_mask, BIT(bit)); + if (!prev) { + preempt_disable(); + /* Perf needs local kick that is NMI safe */ + if (cpu == smp_processor_id()) { + tick_nohz_full_kick(); + } else { + /* Remote irq work not NMI-safe */ + if (!WARN_ON_ONCE(in_nmi())) + tick_nohz_full_kick_cpu(cpu); + } + preempt_enable(); + } +} + +void tick_nohz_dep_clear_cpu(int cpu, enum tick_dep_bits bit) +{ + struct tick_sched *ts = per_cpu_ptr(&tick_cpu_sched, cpu); + + atomic_andnot(BIT(bit), &ts->tick_dep_mask); +} + +/* + * Set a per-task tick dependency. Posix CPU timers need this in order to elapse + * per task timers. + */ +void tick_nohz_dep_set_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + /* + * We could optimize this with just kicking the target running the task + * if that noise matters for nohz full users. + */ + tick_nohz_dep_set_all(&tsk->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_task(struct task_struct *tsk, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &tsk->tick_dep_mask); +} + +/* + * Set a per-taskgroup tick dependency. Posix CPU timers need this in order to elapse + * per process timers. + */ +void tick_nohz_dep_set_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + tick_nohz_dep_set_all(&sig->tick_dep_mask, bit); +} + +void tick_nohz_dep_clear_signal(struct signal_struct *sig, enum tick_dep_bits bit) +{ + atomic_andnot(BIT(bit), &sig->tick_dep_mask); +} + /* * Re-evaluate the need for the tick as we switch the current task. * It might need the tick due to per task/process properties: @@ -261,15 +354,20 @@ void tick_nohz_full_kick_all(void) void __tick_nohz_task_switch(void) { unsigned long flags; + struct tick_sched *ts; local_irq_save(flags); if (!tick_nohz_full_cpu(smp_processor_id())) goto out; - if (tick_nohz_tick_stopped() && !can_stop_full_tick()) - tick_nohz_full_kick(); + ts = this_cpu_ptr(&tick_cpu_sched); + if (ts->tick_stopped) { + if (atomic_read(¤t->tick_dep_mask) || + atomic_read(¤t->signal->tick_dep_mask)) + tick_nohz_full_kick(); + } out: local_irq_restore(flags); } @@ -279,7 +377,7 @@ static int __init tick_nohz_full_setup(char *str) { alloc_bootmem_cpumask_var(&tick_nohz_full_mask); if (cpulist_parse(str, tick_nohz_full_mask) < 0) { - pr_warning("NOHZ: Incorrect nohz_full cpumask\n"); + pr_warn("NO_HZ: Incorrect nohz_full cpumask\n"); free_bootmem_cpumask_var(tick_nohz_full_mask); return 1; } @@ -347,8 +445,7 @@ void __init tick_nohz_init(void) * interrupts to avoid circular dependency on the tick */ if (!arch_irq_work_has_interrupt()) { - pr_warning("NO_HZ: Can't run full dynticks because arch doesn't " - "support irq work self-IPIs\n"); + pr_warn("NO_HZ: Can't run full dynticks because arch doesn't support irq work self-IPIs\n"); cpumask_clear(tick_nohz_full_mask); cpumask_copy(housekeeping_mask, cpu_possible_mask); tick_nohz_full_running = false; @@ -358,7 +455,8 @@ void __init tick_nohz_init(void) cpu = smp_processor_id(); if (cpumask_test_cpu(cpu, tick_nohz_full_mask)) { - pr_warning("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", cpu); + pr_warn("NO_HZ: Clearing %d from nohz_full range for timekeeping\n", + cpu); cpumask_clear_cpu(cpu, tick_nohz_full_mask); } @@ -387,20 +485,14 @@ void __init tick_nohz_init(void) /* * NO HZ enabled ? */ -static int tick_nohz_enabled __read_mostly = 1; +bool tick_nohz_enabled __read_mostly = true; unsigned long tick_nohz_active __read_mostly; /* * Enable / Disable tickless mode */ static int __init setup_tick_nohz(char *str) { - if (!strcmp(str, "off")) - tick_nohz_enabled = 0; - else if (!strcmp(str, "on")) - tick_nohz_enabled = 1; - else - return 0; - return 1; + return (kstrtobool(str, &tick_nohz_enabled) == 0); } __setup("nohz=", setup_tick_nohz); @@ -430,7 +522,7 @@ static void tick_nohz_update_jiffies(ktime_t now) tick_do_update_jiffies64(now); local_irq_restore(flags); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); } /* @@ -603,15 +695,31 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, /* * If the tick is due in the next period, keep it ticking or - * restart it proper. + * force prod the timer. */ delta = next_tick - basemono; if (delta <= (u64)TICK_NSEC) { tick.tv64 = 0; + /* + * We've not stopped the tick yet, and there's a timer in the + * next period, so no point in stopping it either, bail. + */ if (!ts->tick_stopped) goto out; + + /* + * If, OTOH, we did stop it, but there's a pending (expired) + * timer reprogram the timer hardware to fire now. + * + * We will not restart the tick proper, just prod the timer + * hardware into firing an interrupt to process the pending + * timers. Just like tick_irq_exit() will not restart the tick + * for 'normal' interrupts. + * + * Only once we exit the idle loop will we re-enable the tick, + * see tick_nohz_idle_exit(). + */ if (delta == 0) { - /* Tick is stopped, but required now. Enforce it */ tick_nohz_restart(ts, now); goto out; } @@ -671,7 +779,7 @@ static ktime_t tick_nohz_stop_sched_tick(struct tick_sched *ts, ts->last_tick = hrtimer_get_expires(&ts->sched_timer); ts->tick_stopped = 1; - trace_tick_stop(1, " "); + trace_tick_stop(1, TICK_DEP_MASK_NONE); } /* @@ -694,14 +802,14 @@ out: return tick; } -static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now) +static void tick_nohz_restart_sched_tick(struct tick_sched *ts, ktime_t now, int active) { /* Update jiffies first */ tick_do_update_jiffies64(now); - update_cpu_load_nohz(); + update_cpu_load_nohz(active); calc_load_exit_idle(); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sched(); /* * Cancel the scheduled timer and restore the tick */ @@ -722,10 +830,10 @@ static void tick_nohz_full_update_tick(struct tick_sched *ts) if (!ts->tick_stopped && ts->nohz_mode == NOHZ_MODE_INACTIVE) return; - if (can_stop_full_tick()) + if (can_stop_full_tick(ts)) tick_nohz_stop_sched_tick(ts, ktime_get(), cpu); else if (ts->tick_stopped) - tick_nohz_restart_sched_tick(ts, ktime_get()); + tick_nohz_restart_sched_tick(ts, ktime_get(), 1); #endif } @@ -875,7 +983,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) #ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE unsigned long ticks; - if (vtime_accounting_enabled()) + if (vtime_accounting_cpu_enabled()) return; /* * We stopped the tick in idle. Update process times would miss the @@ -916,7 +1024,7 @@ void tick_nohz_idle_exit(void) tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { - tick_nohz_restart_sched_tick(ts, now); + tick_nohz_restart_sched_tick(ts, now, 0); tick_nohz_account_idle_ticks(ts); } @@ -977,9 +1085,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } diff --git a/kernel/time/tick-sched.h b/kernel/time/tick-sched.h index a4a8d4e9baa1..bf38226e5c17 100644 --- a/kernel/time/tick-sched.h +++ b/kernel/time/tick-sched.h @@ -60,6 +60,7 @@ struct tick_sched { u64 next_timer; ktime_t idle_expires; int do_timer_last; + atomic_t tick_dep_mask; }; extern struct tick_sched *tick_get_tick_sched(int cpu); diff --git a/kernel/time/time.c b/kernel/time/time.c index 86751c68e08d..be115b020d27 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -322,6 +322,13 @@ EXPORT_SYMBOL(timespec_trunc); * -year/100+year/400 terms, and add 10.] * * This algorithm was first published by Gauss (I think). + * + * A leap second can be indicated by calling this function with sec as + * 60 (allowable under ISO 8601). The leap second is treated the same + * as the following second since they don't exist in UNIX time. + * + * An encoding of midnight at the end of the day as 24:00:00 - ie. midnight + * tomorrow - (allowable under ISO 8601) is supported. */ time64_t mktime64(const unsigned int year0, const unsigned int mon0, const unsigned int day, const unsigned int hour, @@ -338,7 +345,7 @@ time64_t mktime64(const unsigned int year0, const unsigned int mon0, return ((((time64_t) (year/4 - year/100 + year/400 + 367*mon/12 + day) + year*365 - 719499 - )*24 + hour /* now have hours */ + )*24 + hour /* now have hours - midnight tomorrow handled here */ )*60 + min /* now have minutes */ )*60 + sec; /* finally seconds */ } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 0d4cc7601df7..479d25cd3d4f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -233,6 +233,7 @@ static void tk_setup_internals(struct timekeeper *tk, struct clocksource *clock) u64 tmp, ntpinterval; struct clocksource *old_clock; + ++tk->cs_was_changed_seq; old_clock = tk->tkr_mono.clock; tk->tkr_mono.clock = clock; tk->tkr_mono.read = clock->read; @@ -298,13 +299,11 @@ u32 (*arch_gettimeoffset)(void) = default_arch_gettimeoffset; static inline u32 arch_gettimeoffset(void) { return 0; } #endif -static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +static inline s64 timekeeping_delta_to_ns(struct tk_read_base *tkr, + cycle_t delta) { - cycle_t delta; s64 nsec; - delta = timekeeping_get_delta(tkr); - nsec = delta * tkr->mult + tkr->xtime_nsec; nsec >>= tkr->shift; @@ -312,6 +311,24 @@ static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) return nsec + arch_gettimeoffset(); } +static inline s64 timekeeping_get_ns(struct tk_read_base *tkr) +{ + cycle_t delta; + + delta = timekeeping_get_delta(tkr); + return timekeeping_delta_to_ns(tkr, delta); +} + +static inline s64 timekeeping_cycles_to_ns(struct tk_read_base *tkr, + cycle_t cycles) +{ + cycle_t delta; + + /* calculate the delta since the last update_wall_time */ + delta = clocksource_delta(cycles, tkr->cycle_last, tkr->mask); + return timekeeping_delta_to_ns(tkr, delta); +} + /** * update_fast_timekeeper - Update the fast and NMI safe monotonic timekeeper. * @tkr: Timekeeping readout base from which we take the update @@ -846,43 +863,274 @@ time64_t ktime_get_real_seconds(void) } EXPORT_SYMBOL_GPL(ktime_get_real_seconds); -#ifdef CONFIG_NTP_PPS +/** + * __ktime_get_real_seconds - The same as ktime_get_real_seconds + * but without the sequence counter protect. This internal function + * is called just when timekeeping lock is already held. + */ +time64_t __ktime_get_real_seconds(void) +{ + struct timekeeper *tk = &tk_core.timekeeper; + + return tk->xtime_sec; +} /** - * ktime_get_raw_and_real_ts64 - get day and raw monotonic time in timespec format - * @ts_raw: pointer to the timespec to be set to raw monotonic time - * @ts_real: pointer to the timespec to be set to the time of day - * - * This function reads both the time of day and raw monotonic time at the - * same time atomically and stores the resulting timestamps in timespec - * format. + * ktime_get_snapshot - snapshots the realtime/monotonic raw clocks with counter + * @systime_snapshot: pointer to struct receiving the system time snapshot */ -void ktime_get_raw_and_real_ts64(struct timespec64 *ts_raw, struct timespec64 *ts_real) +void ktime_get_snapshot(struct system_time_snapshot *systime_snapshot) { struct timekeeper *tk = &tk_core.timekeeper; unsigned long seq; - s64 nsecs_raw, nsecs_real; + ktime_t base_raw; + ktime_t base_real; + s64 nsec_raw; + s64 nsec_real; + cycle_t now; WARN_ON_ONCE(timekeeping_suspended); do { seq = read_seqcount_begin(&tk_core.seq); - *ts_raw = tk->raw_time; - ts_real->tv_sec = tk->xtime_sec; - ts_real->tv_nsec = 0; + now = tk->tkr_mono.read(tk->tkr_mono.clock); + systime_snapshot->cs_was_changed_seq = tk->cs_was_changed_seq; + systime_snapshot->clock_was_set_seq = tk->clock_was_set_seq; + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, now); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, now); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + systime_snapshot->cycles = now; + systime_snapshot->real = ktime_add_ns(base_real, nsec_real); + systime_snapshot->raw = ktime_add_ns(base_raw, nsec_raw); +} +EXPORT_SYMBOL_GPL(ktime_get_snapshot); - nsecs_raw = timekeeping_get_ns(&tk->tkr_raw); - nsecs_real = timekeeping_get_ns(&tk->tkr_mono); +/* Scale base by mult/div checking for overflow */ +static int scale64_check_overflow(u64 mult, u64 div, u64 *base) +{ + u64 tmp, rem; - } while (read_seqcount_retry(&tk_core.seq, seq)); + tmp = div64_u64_rem(*base, div, &rem); + + if (((int)sizeof(u64)*8 - fls64(mult) < fls64(tmp)) || + ((int)sizeof(u64)*8 - fls64(mult) < fls64(rem))) + return -EOVERFLOW; + tmp *= mult; + rem *= mult; + + do_div(rem, div); + *base = tmp + rem; + return 0; +} + +/** + * adjust_historical_crosststamp - adjust crosstimestamp previous to current interval + * @history: Snapshot representing start of history + * @partial_history_cycles: Cycle offset into history (fractional part) + * @total_history_cycles: Total history length in cycles + * @discontinuity: True indicates clock was set on history period + * @ts: Cross timestamp that should be adjusted using + * partial/total ratio + * + * Helper function used by get_device_system_crosststamp() to correct the + * crosstimestamp corresponding to the start of the current interval to the + * system counter value (timestamp point) provided by the driver. The + * total_history_* quantities are the total history starting at the provided + * reference point and ending at the start of the current interval. The cycle + * count between the driver timestamp point and the start of the current + * interval is partial_history_cycles. + */ +static int adjust_historical_crosststamp(struct system_time_snapshot *history, + cycle_t partial_history_cycles, + cycle_t total_history_cycles, + bool discontinuity, + struct system_device_crosststamp *ts) +{ + struct timekeeper *tk = &tk_core.timekeeper; + u64 corr_raw, corr_real; + bool interp_forward; + int ret; + + if (total_history_cycles == 0 || partial_history_cycles == 0) + return 0; + + /* Interpolate shortest distance from beginning or end of history */ + interp_forward = partial_history_cycles > total_history_cycles/2 ? + true : false; + partial_history_cycles = interp_forward ? + total_history_cycles - partial_history_cycles : + partial_history_cycles; + + /* + * Scale the monotonic raw time delta by: + * partial_history_cycles / total_history_cycles + */ + corr_raw = (u64)ktime_to_ns( + ktime_sub(ts->sys_monoraw, history->raw)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_raw); + if (ret) + return ret; + + /* + * If there is a discontinuity in the history, scale monotonic raw + * correction by: + * mult(real)/mult(raw) yielding the realtime correction + * Otherwise, calculate the realtime correction similar to monotonic + * raw calculation + */ + if (discontinuity) { + corr_real = mul_u64_u32_div + (corr_raw, tk->tkr_mono.mult, tk->tkr_raw.mult); + } else { + corr_real = (u64)ktime_to_ns( + ktime_sub(ts->sys_realtime, history->real)); + ret = scale64_check_overflow(partial_history_cycles, + total_history_cycles, &corr_real); + if (ret) + return ret; + } + + /* Fixup monotonic raw and real time time values */ + if (interp_forward) { + ts->sys_monoraw = ktime_add_ns(history->raw, corr_raw); + ts->sys_realtime = ktime_add_ns(history->real, corr_real); + } else { + ts->sys_monoraw = ktime_sub_ns(ts->sys_monoraw, corr_raw); + ts->sys_realtime = ktime_sub_ns(ts->sys_realtime, corr_real); + } + + return 0; +} - timespec64_add_ns(ts_raw, nsecs_raw); - timespec64_add_ns(ts_real, nsecs_real); +/* + * cycle_between - true if test occurs chronologically between before and after + */ +static bool cycle_between(cycle_t before, cycle_t test, cycle_t after) +{ + if (test > before && test < after) + return true; + if (test < before && before > after) + return true; + return false; } -EXPORT_SYMBOL(ktime_get_raw_and_real_ts64); -#endif /* CONFIG_NTP_PPS */ +/** + * get_device_system_crosststamp - Synchronously capture system/device timestamp + * @get_time_fn: Callback to get simultaneous device time and + * system counter from the device driver + * @ctx: Context passed to get_time_fn() + * @history_begin: Historical reference point used to interpolate system + * time when counter provided by the driver is before the current interval + * @xtstamp: Receives simultaneously captured system and device time + * + * Reads a timestamp from a device and correlates it to system time + */ +int get_device_system_crosststamp(int (*get_time_fn) + (ktime_t *device_time, + struct system_counterval_t *sys_counterval, + void *ctx), + void *ctx, + struct system_time_snapshot *history_begin, + struct system_device_crosststamp *xtstamp) +{ + struct system_counterval_t system_counterval; + struct timekeeper *tk = &tk_core.timekeeper; + cycle_t cycles, now, interval_start; + unsigned int clock_was_set_seq = 0; + ktime_t base_real, base_raw; + s64 nsec_real, nsec_raw; + u8 cs_was_changed_seq; + unsigned long seq; + bool do_interp; + int ret; + + do { + seq = read_seqcount_begin(&tk_core.seq); + /* + * Try to synchronously capture device time and a system + * counter value calling back into the device driver + */ + ret = get_time_fn(&xtstamp->device, &system_counterval, ctx); + if (ret) + return ret; + + /* + * Verify that the clocksource associated with the captured + * system counter value is the same as the currently installed + * timekeeper clocksource + */ + if (tk->tkr_mono.clock != system_counterval.cs) + return -ENODEV; + cycles = system_counterval.cycles; + + /* + * Check whether the system counter value provided by the + * device driver is on the current timekeeping interval. + */ + now = tk->tkr_mono.read(tk->tkr_mono.clock); + interval_start = tk->tkr_mono.cycle_last; + if (!cycle_between(interval_start, cycles, now)) { + clock_was_set_seq = tk->clock_was_set_seq; + cs_was_changed_seq = tk->cs_was_changed_seq; + cycles = interval_start; + do_interp = true; + } else { + do_interp = false; + } + + base_real = ktime_add(tk->tkr_mono.base, + tk_core.timekeeper.offs_real); + base_raw = tk->tkr_raw.base; + + nsec_real = timekeeping_cycles_to_ns(&tk->tkr_mono, + system_counterval.cycles); + nsec_raw = timekeeping_cycles_to_ns(&tk->tkr_raw, + system_counterval.cycles); + } while (read_seqcount_retry(&tk_core.seq, seq)); + + xtstamp->sys_realtime = ktime_add_ns(base_real, nsec_real); + xtstamp->sys_monoraw = ktime_add_ns(base_raw, nsec_raw); + + /* + * Interpolate if necessary, adjusting back from the start of the + * current interval + */ + if (do_interp) { + cycle_t partial_history_cycles, total_history_cycles; + bool discontinuity; + + /* + * Check that the counter value occurs after the provided + * history reference and that the history doesn't cross a + * clocksource change + */ + if (!history_begin || + !cycle_between(history_begin->cycles, + system_counterval.cycles, cycles) || + history_begin->cs_was_changed_seq != cs_was_changed_seq) + return -EINVAL; + partial_history_cycles = cycles - system_counterval.cycles; + total_history_cycles = cycles - history_begin->cycles; + discontinuity = + history_begin->clock_was_set_seq != clock_was_set_seq; + + ret = adjust_historical_crosststamp(history_begin, + partial_history_cycles, + total_history_cycles, + discontinuity, xtstamp); + if (ret) + return ret; + } + + return 0; +} +EXPORT_SYMBOL_GPL(get_device_system_crosststamp); /** * do_gettimeofday - Returns the time of day in a timeval @@ -959,7 +1207,7 @@ int timekeeping_inject_offset(struct timespec *ts) struct timespec64 ts64, tmp; int ret = 0; - if ((unsigned long)ts->tv_nsec >= NSEC_PER_SEC) + if (!timespec_inject_offset_valid(ts)) return -EINVAL; ts64 = timespec_to_timespec64(*ts); @@ -1592,9 +1840,12 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, { s64 interval = tk->cycle_interval; s64 xinterval = tk->xtime_interval; + u32 base = tk->tkr_mono.clock->mult; + u32 max = tk->tkr_mono.clock->maxadj; + u32 cur_adj = tk->tkr_mono.mult; s64 tick_error; bool negative; - u32 adj; + u32 adj_scale; /* Remove any current error adj from freq calculation */ if (tk->ntp_err_mult) @@ -1613,13 +1864,33 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, /* preserve the direction of correction */ negative = (tick_error < 0); - /* Sort out the magnitude of the correction */ - tick_error = abs64(tick_error); - for (adj = 0; tick_error > interval; adj++) + /* If any adjustment would pass the max, just return */ + if (negative && (cur_adj - 1) <= (base - max)) + return; + if (!negative && (cur_adj + 1) >= (base + max)) + return; + /* + * Sort out the magnitude of the correction, but + * avoid making so large a correction that we go + * over the max adjustment. + */ + adj_scale = 0; + tick_error = abs(tick_error); + while (tick_error > interval) { + u32 adj = 1 << (adj_scale + 1); + + /* Check if adjustment gets us within 1 unit from the max */ + if (negative && (cur_adj - adj) <= (base - max)) + break; + if (!negative && (cur_adj + adj) >= (base + max)) + break; + + adj_scale++; tick_error >>= 1; + } /* scale the corrections */ - timekeeping_apply_adjustment(tk, offset, negative, adj); + timekeeping_apply_adjustment(tk, offset, negative, adj_scale); } /* diff --git a/kernel/time/timekeeping_internal.h b/kernel/time/timekeeping_internal.h index 4ea005a7f9da..5be76270ec4a 100644 --- a/kernel/time/timekeeping_internal.h +++ b/kernel/time/timekeeping_internal.h @@ -17,7 +17,11 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) { cycle_t ret = (now - last) & mask; - return (s64) ret > 0 ? ret : 0; + /* + * Prevent time going backwards by checking the MSB of mask in + * the result. If set, return 0. + */ + return ret & ~(mask >> 1) ? 0 : ret; } #else static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) @@ -26,4 +30,6 @@ static inline cycle_t clocksource_delta(cycle_t now, cycle_t last, cycle_t mask) } #endif +extern time64_t __ktime_get_real_seconds(void); + #endif /* _TIMEKEEPING_INTERNAL_H */ diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 74591ba9474f..73164c3aa56b 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -977,13 +977,29 @@ EXPORT_SYMBOL(add_timer); */ void add_timer_on(struct timer_list *timer, int cpu) { - struct tvec_base *base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *new_base = per_cpu_ptr(&tvec_bases, cpu); + struct tvec_base *base; unsigned long flags; timer_stats_timer_set_start_info(timer); BUG_ON(timer_pending(timer) || !timer->function); - spin_lock_irqsave(&base->lock, flags); - timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu; + + /* + * If @timer was on a different CPU, it should be migrated with the + * old base locked to prevent other operations proceeding with the + * wrong base locked. See lock_timer_base(). + */ + base = lock_timer_base(timer, &flags); + if (base != new_base) { + timer->flags |= TIMER_MIGRATING; + + spin_unlock(&base->lock); + base = new_base; + spin_lock(&base->lock); + WRITE_ONCE(timer->flags, + (timer->flags & ~TIMER_BASEMASK) | cpu); + } + debug_activate(timer, timer->expires); internal_add_timer(base, timer); spin_unlock_irqrestore(&base->lock, flags); @@ -1550,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { @@ -1682,10 +1709,10 @@ EXPORT_SYMBOL(msleep_interruptible); static void __sched do_usleep_range(unsigned long min, unsigned long max) { ktime_t kmin; - unsigned long delta; + u64 delta; kmin = ktime_set(0, min * NSEC_PER_USEC); - delta = (max - min) * NSEC_PER_USEC; + delta = (u64)(max - min) * NSEC_PER_USEC; schedule_hrtimeout_range(&kmin, delta, HRTIMER_MODE_REL); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8d6363f42169..e45db6b0d878 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -434,7 +434,7 @@ config UPROBE_EVENT config BPF_EVENTS depends on BPF_SYSCALL - depends on KPROBE_EVENT || UPROBE_EVENT + depends on (KPROBE_EVENT || UPROBE_EVENT) && PERF_EVENTS bool default y help diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index a990824c8604..f94e7a21f52d 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -349,16 +349,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer, if (count >= BLK_TN_MAX_MSG) return -EINVAL; - msg = kmalloc(count + 1, GFP_KERNEL); - if (msg == NULL) - return -ENOMEM; - - if (copy_from_user(msg, buffer, count)) { - kfree(msg); - return -EFAULT; - } + msg = memdup_user_nul(buffer, count); + if (IS_ERR(msg)) + return PTR_ERR(msg); - msg[count] = '\0'; bt = filp->private_data; __trace_note_message(bt, "%s", msg); kfree(msg); @@ -1443,12 +1437,12 @@ static struct trace_event trace_blk_event = { static int __init init_blk_tracer(void) { if (!register_trace_event(&trace_blk_event)) { - pr_warning("Warning: could not register block events\n"); + pr_warn("Warning: could not register block events\n"); return 1; } if (register_tracer(&blk_tracer) != 0) { - pr_warning("Warning: could not register the block tracer\n"); + pr_warn("Warning: could not register the block tracer\n"); unregister_trace_event(&trace_blk_event); return 1; } diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 4228fd3682c3..3e4ffb3ace5f 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -13,8 +13,6 @@ #include <linux/ctype.h> #include "trace.h" -static DEFINE_PER_CPU(int, bpf_prog_active); - /** * trace_call_bpf - invoke BPF program * @prog: BPF program @@ -191,14 +189,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +229,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +238,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; @@ -293,6 +297,8 @@ static const struct bpf_func_proto *kprobe_prog_func_proto(enum bpf_func_id func return &bpf_perf_event_read_proto; case BPF_FUNC_perf_event_output: return &bpf_perf_event_output_proto; + case BPF_FUNC_get_stackid: + return &bpf_get_stackid_proto; default: return NULL; } @@ -316,7 +322,7 @@ static bool kprobe_prog_is_valid_access(int off, int size, enum bpf_access_type return true; } -static struct bpf_verifier_ops kprobe_prog_ops = { +static const struct bpf_verifier_ops kprobe_prog_ops = { .get_func_proto = kprobe_prog_func_proto, .is_valid_access = kprobe_prog_is_valid_access, }; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3f743b147247..b1870fbd2b67 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -62,8 +62,6 @@ #define FTRACE_HASH_DEFAULT_BITS 10 #define FTRACE_HASH_MAX_BITS 12 -#define FL_GLOBAL_CONTROL_MASK (FTRACE_OPS_FL_CONTROL) - #ifdef CONFIG_DYNAMIC_FTRACE #define INIT_OPS_HASH(opsname) \ .func_hash = &opsname.local_hash, \ @@ -113,14 +111,9 @@ static int ftrace_disabled __read_mostly; static DEFINE_MUTEX(ftrace_lock); -static struct ftrace_ops *ftrace_control_list __read_mostly = &ftrace_list_end; static struct ftrace_ops *ftrace_ops_list __read_mostly = &ftrace_list_end; ftrace_func_t ftrace_trace_function __read_mostly = ftrace_stub; static struct ftrace_ops global_ops; -static struct ftrace_ops control_ops; - -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs); #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -203,7 +196,7 @@ void clear_ftrace_function(void) ftrace_trace_function = ftrace_stub; } -static void control_ops_disable_all(struct ftrace_ops *ops) +static void per_cpu_ops_disable_all(struct ftrace_ops *ops) { int cpu; @@ -211,16 +204,19 @@ static void control_ops_disable_all(struct ftrace_ops *ops) *per_cpu_ptr(ops->disabled, cpu) = 1; } -static int control_ops_alloc(struct ftrace_ops *ops) +static int per_cpu_ops_alloc(struct ftrace_ops *ops) { int __percpu *disabled; + if (WARN_ON_ONCE(!(ops->flags & FTRACE_OPS_FL_PER_CPU))) + return -EINVAL; + disabled = alloc_percpu(int); if (!disabled) return -ENOMEM; ops->disabled = disabled; - control_ops_disable_all(ops); + per_cpu_ops_disable_all(ops); return 0; } @@ -256,10 +252,11 @@ static inline void update_function_graph_func(void) { } static ftrace_func_t ftrace_ops_get_list_func(struct ftrace_ops *ops) { /* - * If this is a dynamic ops or we force list func, + * If this is a dynamic, RCU, or per CPU ops, or we force list func, * then it needs to call the list anyway. */ - if (ops->flags & FTRACE_OPS_FL_DYNAMIC || FTRACE_FORCE_LIST_FUNC) + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU | + FTRACE_OPS_FL_RCU) || FTRACE_FORCE_LIST_FUNC) return ftrace_ops_list_func; return ftrace_ops_get_func(ops); @@ -383,26 +380,6 @@ static int remove_ftrace_ops(struct ftrace_ops **list, struct ftrace_ops *ops) return 0; } -static void add_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int first = *list == &ftrace_list_end; - add_ftrace_ops(list, ops); - if (first) - add_ftrace_ops(&ftrace_ops_list, main_ops); -} - -static int remove_ftrace_list_ops(struct ftrace_ops **list, - struct ftrace_ops *main_ops, - struct ftrace_ops *ops) -{ - int ret = remove_ftrace_ops(list, ops); - if (!ret && *list == &ftrace_list_end) - ret = remove_ftrace_ops(&ftrace_ops_list, main_ops); - return ret; -} - static void ftrace_update_trampoline(struct ftrace_ops *ops); static int __register_ftrace_function(struct ftrace_ops *ops) @@ -430,14 +407,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops) if (!core_kernel_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - if (control_ops_alloc(ops)) + if (ops->flags & FTRACE_OPS_FL_PER_CPU) { + if (per_cpu_ops_alloc(ops)) return -ENOMEM; - add_ftrace_list_ops(&ftrace_control_list, &control_ops, ops); - /* The control_ops needs the trampoline update */ - ops = &control_ops; - } else - add_ftrace_ops(&ftrace_ops_list, ops); + } + + add_ftrace_ops(&ftrace_ops_list, ops); /* Always save the function, and reset at unregistering */ ops->saved_func = ops->func; @@ -460,11 +435,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops) if (WARN_ON(!(ops->flags & FTRACE_OPS_FL_ENABLED))) return -EBUSY; - if (ops->flags & FTRACE_OPS_FL_CONTROL) { - ret = remove_ftrace_list_ops(&ftrace_control_list, - &control_ops, ops); - } else - ret = remove_ftrace_ops(&ftrace_ops_list, ops); + ret = remove_ftrace_ops(&ftrace_ops_list, ops); if (ret < 0) return ret; @@ -1059,8 +1030,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) for_each_possible_cpu(cpu) { stat = &per_cpu(ftrace_profile_stats, cpu); - /* allocate enough for function name + cpu number */ - name = kmalloc(32, GFP_KERNEL); + name = kasprintf(GFP_KERNEL, "function%d", cpu); if (!name) { /* * The files created are permanent, if something happens @@ -1072,7 +1042,6 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) return; } stat->stat = function_stats; - snprintf(name, 32, "function%d", cpu); stat->stat.name = name; ret = register_stat_tracer(&stat->stat); if (ret) { @@ -1087,8 +1056,7 @@ static __init void ftrace_profile_tracefs(struct dentry *d_tracer) entry = tracefs_create_file("function_profile_enabled", 0644, d_tracer, NULL, &ftrace_profile_fops); if (!entry) - pr_warning("Could not create tracefs " - "'function_profile_enabled' entry\n"); + pr_warn("Could not create tracefs 'function_profile_enabled' entry\n"); } #else /* CONFIG_FUNCTION_PROFILER */ @@ -1639,7 +1607,7 @@ static bool test_rec_ops_needs_regs(struct dyn_ftrace *rec) return keep_regs; } -static void __ftrace_hash_rec_update(struct ftrace_ops *ops, +static bool __ftrace_hash_rec_update(struct ftrace_ops *ops, int filter_hash, bool inc) { @@ -1647,12 +1615,13 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, struct ftrace_hash *other_hash; struct ftrace_page *pg; struct dyn_ftrace *rec; + bool update = false; int count = 0; int all = 0; /* Only update if the ops has been registered */ if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) - return; + return false; /* * In the filter_hash case: @@ -1679,7 +1648,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, * then there's nothing to do. */ if (ftrace_hash_empty(hash)) - return; + return false; } do_for_each_ftrace_rec(pg, rec) { @@ -1687,6 +1656,9 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, int in_hash = 0; int match = 0; + if (rec->flags & FTRACE_FL_DISABLED) + continue; + if (all) { /* * Only the filter_hash affects all records. @@ -1720,7 +1692,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, if (inc) { rec->flags++; if (FTRACE_WARN_ON(ftrace_rec_count(rec) == FTRACE_REF_MAX)) - return; + return false; /* * If there's only a single callback registered to a @@ -1746,7 +1718,7 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, rec->flags |= FTRACE_FL_REGS; } else { if (FTRACE_WARN_ON(ftrace_rec_count(rec) == 0)) - return; + return false; rec->flags--; /* @@ -1779,22 +1751,28 @@ static void __ftrace_hash_rec_update(struct ftrace_ops *ops, */ } count++; + + /* Must match FTRACE_UPDATE_CALLS in ftrace_modify_all_code() */ + update |= ftrace_test_record(rec, 1) != FTRACE_UPDATE_IGNORE; + /* Shortcut, if we handled all records, we are done. */ if (!all && count == hash->count) - return; + return update; } while_for_each_ftrace_rec(); + + return update; } -static void ftrace_hash_rec_disable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_disable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 0); + return __ftrace_hash_rec_update(ops, filter_hash, 0); } -static void ftrace_hash_rec_enable(struct ftrace_ops *ops, +static bool ftrace_hash_rec_enable(struct ftrace_ops *ops, int filter_hash) { - __ftrace_hash_rec_update(ops, filter_hash, 1); + return __ftrace_hash_rec_update(ops, filter_hash, 1); } static void ftrace_hash_rec_update_modify(struct ftrace_ops *ops, @@ -1940,7 +1918,7 @@ static int ftrace_hash_ipmodify_update(struct ftrace_ops *ops, return __ftrace_hash_update_ipmodify(ops, old_hash, new_hash); } -static void print_ip_ins(const char *fmt, unsigned char *p) +static void print_ip_ins(const char *fmt, const unsigned char *p) { int i; @@ -1952,6 +1930,31 @@ static void print_ip_ins(const char *fmt, unsigned char *p) static struct ftrace_ops * ftrace_find_tramp_ops_any(struct dyn_ftrace *rec); +static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, struct ftrace_ops *ops); + +enum ftrace_bug_type ftrace_bug_type; +const void *ftrace_expected; + +static void print_bug_type(void) +{ + switch (ftrace_bug_type) { + case FTRACE_BUG_UNKNOWN: + break; + case FTRACE_BUG_INIT: + pr_info("Initializing ftrace call sites\n"); + break; + case FTRACE_BUG_NOP: + pr_info("Setting ftrace call site to NOP\n"); + break; + case FTRACE_BUG_CALL: + pr_info("Setting ftrace call site to call ftrace function\n"); + break; + case FTRACE_BUG_UPDATE: + pr_info("Updating ftrace call site to call a different ftrace function\n"); + break; + } +} /** * ftrace_bug - report and shutdown function tracer @@ -1979,8 +1982,12 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) FTRACE_WARN_ON_ONCE(1); pr_info("ftrace failed to modify "); print_ip_sym(ip); - print_ip_ins(" actual: ", (unsigned char *)ip); + print_ip_ins(" actual: ", (unsigned char *)ip); pr_cont("\n"); + if (ftrace_expected) { + print_ip_ins(" expected: ", ftrace_expected); + pr_cont("\n"); + } break; case -EPERM: FTRACE_WARN_ON_ONCE(1); @@ -1992,6 +1999,7 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) pr_info("ftrace faulted on unknown error "); print_ip_sym(ip); } + print_bug_type(); if (rec) { struct ftrace_ops *ops = NULL; @@ -2000,15 +2008,19 @@ void ftrace_bug(int failed, struct dyn_ftrace *rec) rec->flags & FTRACE_FL_REGS ? " R" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - pr_cont("\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + pr_cont("\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else pr_cont("\ttramp: ERROR!"); } ip = ftrace_get_addr_curr(rec); - pr_cont(" expected tramp: %lx\n", ip); + pr_cont("\n expected tramp: %lx\n", ip); } } @@ -2016,6 +2028,11 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) { unsigned long flag = 0UL; + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + + if (rec->flags & FTRACE_FL_DISABLED) + return FTRACE_UPDATE_IGNORE; + /* * If we are updating calls: * @@ -2077,9 +2094,12 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) * from the save regs, to a non-save regs function or * vice versa, or from a trampoline call. */ - if (flag & FTRACE_FL_ENABLED) + if (flag & FTRACE_FL_ENABLED) { + ftrace_bug_type = FTRACE_BUG_CALL; return FTRACE_UPDATE_MAKE_CALL; + } + ftrace_bug_type = FTRACE_BUG_UPDATE; return FTRACE_UPDATE_MODIFY_CALL; } @@ -2096,6 +2116,7 @@ static int ftrace_check_record(struct dyn_ftrace *rec, int enable, int update) FTRACE_FL_REGS_EN); } + ftrace_bug_type = FTRACE_BUG_NOP; return FTRACE_UPDATE_MAKE_NOP; } @@ -2145,6 +2166,24 @@ ftrace_find_tramp_ops_any(struct dyn_ftrace *rec) } static struct ftrace_ops * +ftrace_find_tramp_ops_next(struct dyn_ftrace *rec, + struct ftrace_ops *op) +{ + unsigned long ip = rec->ip; + + while_for_each_ftrace_op(op) { + + if (!op->trampoline) + continue; + + if (hash_contains_ip(ip, op->func_hash)) + return op; + } + + return NULL; +} + +static struct ftrace_ops * ftrace_find_tramp_ops_curr(struct dyn_ftrace *rec) { struct ftrace_ops *op; @@ -2279,8 +2318,8 @@ unsigned long ftrace_get_addr_curr(struct dyn_ftrace *rec) if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_curr(rec); if (FTRACE_WARN_ON(!ops)) { - pr_warning("Bad trampoline accounting at: %p (%pS)\n", - (void *)rec->ip, (void *)rec->ip); + pr_warn("Bad trampoline accounting at: %p (%pS)\n", + (void *)rec->ip, (void *)rec->ip); /* Ftrace is shutting down, return anything */ return (unsigned long)FTRACE_ADDR; } @@ -2307,17 +2346,22 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable) ret = ftrace_update_record(rec, enable); + ftrace_bug_type = FTRACE_BUG_UNKNOWN; + switch (ret) { case FTRACE_UPDATE_IGNORE: return 0; case FTRACE_UPDATE_MAKE_CALL: + ftrace_bug_type = FTRACE_BUG_CALL; return ftrace_make_call(rec, ftrace_addr); case FTRACE_UPDATE_MAKE_NOP: + ftrace_bug_type = FTRACE_BUG_NOP; return ftrace_make_nop(NULL, rec, ftrace_old_addr); case FTRACE_UPDATE_MODIFY_CALL: + ftrace_bug_type = FTRACE_BUG_UPDATE; return ftrace_modify_call(rec, ftrace_old_addr, ftrace_addr); } @@ -2425,6 +2469,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec) ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR); if (ret) { + ftrace_bug_type = FTRACE_BUG_INIT; ftrace_bug(ret, rec); return 0; } @@ -2566,7 +2611,7 @@ void __weak arch_ftrace_trampoline_free(struct ftrace_ops *ops) { } -static void control_ops_free(struct ftrace_ops *ops) +static void per_cpu_ops_free(struct ftrace_ops *ops) { free_percpu(ops->disabled); } @@ -2603,7 +2648,6 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; ftrace_start_up++; - command |= FTRACE_UPDATE_CALLS; /* * Note that ftrace probes uses this to start up @@ -2624,7 +2668,8 @@ static int ftrace_startup(struct ftrace_ops *ops, int command) return ret; } - ftrace_hash_rec_enable(ops, 1); + if (ftrace_hash_rec_enable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; ftrace_startup_enable(command); @@ -2654,11 +2699,11 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* Disabling ipmodify never fails */ ftrace_hash_ipmodify_disable(ops); - ftrace_hash_rec_disable(ops, 1); - ops->flags &= ~FTRACE_OPS_FL_ENABLED; + if (ftrace_hash_rec_disable(ops, 1)) + command |= FTRACE_UPDATE_CALLS; - command |= FTRACE_UPDATE_CALLS; + ops->flags &= ~FTRACE_OPS_FL_ENABLED; if (saved_ftrace_func != ftrace_trace_function) { saved_ftrace_func = ftrace_trace_function; @@ -2667,13 +2712,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their + * If these are per_cpu ops, they still need their * per_cpu field freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); return 0; } @@ -2714,7 +2759,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) /* * Dynamic ops may be freed, we must make sure that all * callers are done before leaving this function. - * The same goes for freeing the per_cpu data of the control + * The same goes for freeing the per_cpu data of the per_cpu * ops. * * Again, normal synchronize_sched() is not good enough. @@ -2725,13 +2770,13 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) * infrastructure to do the synchronization, thus we must do it * ourselves. */ - if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_PER_CPU)) { schedule_on_each_cpu(ftrace_sync); arch_ftrace_trampoline_free(ops); - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & FTRACE_OPS_FL_PER_CPU) + per_cpu_ops_free(ops); } return 0; @@ -2798,9 +2843,9 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) if (!(ops->flags & FTRACE_OPS_FL_ENABLED)) return 0; - /* If ops traces all mods, we already accounted for it */ + /* If ops traces all then it includes this function */ if (ops_traces_mod(ops)) - return 0; + return 1; /* The function must be in the filter */ if (!ftrace_hash_empty(ops->func_hash->filter_hash) && @@ -2814,64 +2859,41 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec) return 1; } -static int referenced_filters(struct dyn_ftrace *rec) -{ - struct ftrace_ops *ops; - int cnt = 0; - - for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { - if (ops_references_rec(ops, rec)) - cnt++; - } - - return cnt; -} - static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) { struct ftrace_page *pg; struct dyn_ftrace *p; cycle_t start, stop; unsigned long update_cnt = 0; - unsigned long ref = 0; - bool test = false; + unsigned long rec_flags = 0; int i; + start = ftrace_now(raw_smp_processor_id()); + /* - * When adding a module, we need to check if tracers are - * currently enabled and if they are set to trace all functions. - * If they are, we need to enable the module functions as well - * as update the reference counts for those function records. + * When a module is loaded, this function is called to convert + * the calls to mcount in its text to nops, and also to create + * an entry in the ftrace data. Now, if ftrace is activated + * after this call, but before the module sets its text to + * read-only, the modification of enabling ftrace can fail if + * the read-only is done while ftrace is converting the calls. + * To prevent this, the module's records are set as disabled + * and will be enabled after the call to set the module's text + * to read-only. */ - if (mod) { - struct ftrace_ops *ops; - - for (ops = ftrace_ops_list; - ops != &ftrace_list_end; ops = ops->next) { - if (ops->flags & FTRACE_OPS_FL_ENABLED) { - if (ops_traces_mod(ops)) - ref++; - else - test = true; - } - } - } - - start = ftrace_now(raw_smp_processor_id()); + if (mod) + rec_flags |= FTRACE_FL_DISABLED; for (pg = new_pgs; pg; pg = pg->next) { for (i = 0; i < pg->index; i++) { - int cnt = ref; /* If something went wrong, bail without enabling anything */ if (unlikely(ftrace_disabled)) return -1; p = &pg->records[i]; - if (test) - cnt += referenced_filters(p); - p->flags = cnt; + p->flags = rec_flags; /* * Do the initial record conversion from mcount jump @@ -2881,21 +2903,6 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs) break; update_cnt++; - - /* - * If the tracing is enabled, go ahead and enable the record. - * - * The reason not to enable the record immediatelly is the - * inherent check of ftrace_make_nop/ftrace_make_call for - * correct previous instructions. Making first the NOP - * conversion puts the module to the correct state, thus - * passing the ftrace_make_call check. - */ - if (ftrace_start_up && cnt) { - int failed = __ftrace_replace_code(p, 1); - if (failed) - ftrace_bug(failed, p); - } } } @@ -3258,7 +3265,7 @@ static int t_show(struct seq_file *m, void *v) seq_printf(m, "%ps", (void *)rec->ip); if (iter->flags & FTRACE_ITER_ENABLED) { - struct ftrace_ops *ops = NULL; + struct ftrace_ops *ops; seq_printf(m, " (%ld)%s%s", ftrace_rec_count(rec), @@ -3266,14 +3273,19 @@ static int t_show(struct seq_file *m, void *v) rec->flags & FTRACE_FL_IPMODIFY ? " I" : " "); if (rec->flags & FTRACE_FL_TRAMP_EN) { ops = ftrace_find_tramp_ops_any(rec); - if (ops) - seq_printf(m, "\ttramp: %pS", - (void *)ops->trampoline); - else + if (ops) { + do { + seq_printf(m, "\ttramp: %pS (%pS)", + (void *)ops->trampoline, + (void *)ops->func); + add_trampoline_func(m, ops, rec); + ops = ftrace_find_tramp_ops_next(rec, ops); + } while (ops); + } else seq_puts(m, "\ttramp: ERROR!"); - + } else { + add_trampoline_func(m, NULL, rec); } - add_trampoline_func(m, ops, rec); } seq_putc(m, '\n'); @@ -4898,6 +4910,19 @@ static int ftrace_process_locs(struct module *mod, #define next_to_ftrace_page(p) container_of(p, struct ftrace_page, next) +static int referenced_filters(struct dyn_ftrace *rec) +{ + struct ftrace_ops *ops; + int cnt = 0; + + for (ops = ftrace_ops_list; ops != &ftrace_list_end; ops = ops->next) { + if (ops_references_rec(ops, rec)) + cnt++; + } + + return cnt; +} + void ftrace_release_mod(struct module *mod) { struct dyn_ftrace *rec; @@ -4940,44 +4965,85 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +void ftrace_module_enable(struct module *mod) { - if (ftrace_disabled || start == end) - return; - ftrace_process_locs(mod, start, end); -} + struct dyn_ftrace *rec; + struct ftrace_page *pg; -void ftrace_module_init(struct module *mod) -{ - ftrace_init_module(mod, mod->ftrace_callsites, - mod->ftrace_callsites + - mod->num_ftrace_callsites); -} + mutex_lock(&ftrace_lock); -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; + if (ftrace_disabled) + goto out_unlock; - if (val == MODULE_STATE_GOING) - ftrace_release_mod(mod); + /* + * If the tracing is enabled, go ahead and enable the record. + * + * The reason not to enable the record immediatelly is the + * inherent check of ftrace_make_nop/ftrace_make_call for + * correct previous instructions. Making first the NOP + * conversion puts the module to the correct state, thus + * passing the ftrace_make_call check. + * + * We also delay this to after the module code already set the + * text to read-only, as we now need to set it back to read-write + * so that we can modify the text. + */ + if (ftrace_start_up) + ftrace_arch_code_modify_prepare(); - return 0; + do_for_each_ftrace_rec(pg, rec) { + int cnt; + /* + * do_for_each_ftrace_rec() is a double loop. + * module text shares the pg. If a record is + * not part of this module, then skip this pg, + * which the "break" will do. + */ + if (!within_module_core(rec->ip, mod)) + break; + + cnt = 0; + + /* + * When adding a module, we need to check if tracers are + * currently enabled and if they are, and can trace this record, + * we need to enable the module functions as well as update the + * reference counts for those function records. + */ + if (ftrace_start_up) + cnt += referenced_filters(rec); + + /* This clears FTRACE_FL_DISABLED */ + rec->flags = cnt; + + if (ftrace_start_up && cnt) { + int failed = __ftrace_replace_code(rec, 1); + if (failed) { + ftrace_bug(failed, rec); + goto out_loop; + } + } + + } while_for_each_ftrace_rec(); + + out_loop: + if (ftrace_start_up) + ftrace_arch_code_modify_post_process(); + + out_unlock: + mutex_unlock(&ftrace_lock); } -#else -static int ftrace_module_notify_exit(struct notifier_block *self, - unsigned long val, void *data) + +void ftrace_module_init(struct module *mod) { - return 0; + if (ftrace_disabled || !mod->num_ftrace_callsites) + return; + + ftrace_process_locs(mod, mod->ftrace_callsites, + mod->ftrace_callsites + mod->num_ftrace_callsites); } #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_exit_nb = { - .notifier_call = ftrace_module_notify_exit, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5006,10 +5072,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_exit_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; @@ -5116,44 +5178,6 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static void -ftrace_ops_control_func(unsigned long ip, unsigned long parent_ip, - struct ftrace_ops *op, struct pt_regs *regs) -{ - if (unlikely(trace_recursion_test(TRACE_CONTROL_BIT))) - return; - - /* - * Some of the ops may be dynamically allocated, - * they must be freed after a synchronize_sched(). - */ - preempt_disable_notrace(); - trace_recursion_set(TRACE_CONTROL_BIT); - - /* - * Control funcs (perf) uses RCU. Only trace if - * RCU is currently active. - */ - if (!rcu_is_watching()) - goto out; - - do_for_each_ftrace_op(op, ftrace_control_list) { - if (!(op->flags & FTRACE_OPS_FL_STUB) && - !ftrace_function_local_disabled(op) && - ftrace_ops_test(op, ip, regs)) - op->func(ip, parent_ip, op, regs); - } while_for_each_ftrace_op(op); - out: - trace_recursion_clear(TRACE_CONTROL_BIT); - preempt_enable_notrace(); -} - -static struct ftrace_ops control_ops = { - .func = ftrace_ops_control_func, - .flags = FTRACE_OPS_FL_RECURSION_SAFE | FTRACE_OPS_FL_INITIALIZED, - INIT_OPS_HASH(control_ops) -}; - static inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) @@ -5170,8 +5194,22 @@ __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, * they must be freed after a synchronize_sched(). */ preempt_disable_notrace(); + do_for_each_ftrace_op(op, ftrace_ops_list) { - if (ftrace_ops_test(op, ip, regs)) { + /* + * Check the following for each ops before calling their func: + * if RCU flag is set, then rcu_is_watching() must be true + * if PER_CPU is set, then ftrace_function_local_disable() + * must be false + * Otherwise test if the ip matches the ops filter + * + * If any of the above fails then the op->func() is not executed. + */ + if ((!(op->flags & FTRACE_OPS_FL_RCU) || rcu_is_watching()) && + (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) && + ftrace_ops_test(op, ip, regs)) { + if (FTRACE_WARN_ON(!op->func)) { pr_warn("op=%p %pS\n", op, op); goto out; @@ -5195,7 +5233,7 @@ out: * being NULL, or CONFIG_DYNAMIC_FTRACE_WITH_REGS. * Note, CONFIG_DYNAMIC_FTRACE_WITH_REGS expects a full regs to be saved. * An architecture can pass partial regs with ftrace_ops and still - * set the ARCH_SUPPORT_FTARCE_OPS. + * set the ARCH_SUPPORTS_FTRACE_OPS. */ #if ARCH_SUPPORTS_FTRACE_OPS static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, @@ -5212,20 +5250,29 @@ static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) /* * If there's only one function registered but it does not support - * recursion, this function will be called by the mcount trampoline. - * This function will handle recursion protection. + * recursion, needs RCU protection and/or requires per cpu handling, then + * this function will be called by the mcount trampoline. */ -static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, +static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *op, struct pt_regs *regs) { int bit; + if ((op->flags & FTRACE_OPS_FL_RCU) && !rcu_is_watching()) + return; + bit = trace_test_and_set_recursion(TRACE_LIST_START, TRACE_LIST_MAX); if (bit < 0) return; - op->func(ip, parent_ip, op, regs); + preempt_disable_notrace(); + if (!(op->flags & FTRACE_OPS_FL_PER_CPU) || + !ftrace_function_local_disabled(op)) { + op->func(ip, parent_ip, op, regs); + } + + preempt_enable_notrace(); trace_clear_recursion(bit); } @@ -5243,12 +5290,12 @@ static void ftrace_ops_recurs_func(unsigned long ip, unsigned long parent_ip, ftrace_func_t ftrace_ops_get_func(struct ftrace_ops *ops) { /* - * If the func handles its own recursion, call it directly. - * Otherwise call the recursion protected function that - * will call the ftrace ops function. + * If the function does not handle recursion, needs to be RCU safe, + * or does per cpu logic, then we need to call the assist handler. */ - if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE)) - return ftrace_ops_recurs_func; + if (!(ops->flags & FTRACE_OPS_FL_RECURSION_SAFE) || + ops->flags & (FTRACE_OPS_FL_RCU | FTRACE_OPS_FL_PER_CPU)) + return ftrace_ops_assist_func; return ops->func; } diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c index eb4220a132ec..81b87451c0ea 100644 --- a/kernel/trace/power-traces.c +++ b/kernel/trace/power-traces.c @@ -15,4 +15,5 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(suspend_resume); EXPORT_TRACEPOINT_SYMBOL_GPL(cpu_idle); +EXPORT_TRACEPOINT_SYMBOL_GPL(powernv_throttle); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 75f1d05ea82d..95181e36891a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1001,17 +1001,13 @@ static int rb_head_page_replace(struct buffer_page *old, /* * rb_tail_page_update - move the tail page forward - * - * Returns 1 if moved tail page, 0 if someone else did. */ -static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, +static void rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page *tail_page, struct buffer_page *next_page) { - struct buffer_page *old_tail; unsigned long old_entries; unsigned long old_write; - int ret = 0; /* * The tail page now needs to be moved forward. @@ -1036,7 +1032,7 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, * it is, then it is up to us to update the tail * pointer. */ - if (tail_page == cpu_buffer->tail_page) { + if (tail_page == READ_ONCE(cpu_buffer->tail_page)) { /* Zero the write counter */ unsigned long val = old_write & ~RB_WRITE_MASK; unsigned long eval = old_entries & ~RB_WRITE_MASK; @@ -1061,14 +1057,9 @@ static int rb_tail_page_update(struct ring_buffer_per_cpu *cpu_buffer, */ local_set(&next_page->page->commit, 0); - old_tail = cmpxchg(&cpu_buffer->tail_page, - tail_page, next_page); - - if (old_tail == tail_page) - ret = 1; + /* Again, either we update tail_page or an interrupt does */ + (void)cmpxchg(&cpu_buffer->tail_page, tail_page, next_page); } - - return ret; } static int rb_check_bpage(struct ring_buffer_per_cpu *cpu_buffer, @@ -1887,12 +1878,6 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - BUF_PAGE_HDR_SIZE; } -static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer) -{ - cpu_buffer->read_stamp = cpu_buffer->reader_page->page->time_stamp; - cpu_buffer->reader_page->read = 0; -} - static void rb_inc_iter(struct ring_buffer_iter *iter) { struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer; @@ -2042,12 +2027,15 @@ rb_handle_head_page(struct ring_buffer_per_cpu *cpu_buffer, * the tail page would have moved. */ if (ret == RB_PAGE_NORMAL) { + struct buffer_page *buffer_tail_page; + + buffer_tail_page = READ_ONCE(cpu_buffer->tail_page); /* * If the tail had moved passed next, then we need * to reset the pointer. */ - if (cpu_buffer->tail_page != tail_page && - cpu_buffer->tail_page != next_page) + if (buffer_tail_page != tail_page && + buffer_tail_page != next_page) rb_head_page_set_normal(cpu_buffer, new_head, next_page, RB_PAGE_HEAD); @@ -2141,6 +2129,8 @@ rb_reset_tail(struct ring_buffer_per_cpu *cpu_buffer, local_sub(length, &tail_page->write); } +static inline void rb_end_commit(struct ring_buffer_per_cpu *cpu_buffer); + /* * This is the slow path, force gcc not to inline it. */ @@ -2153,7 +2143,6 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer *buffer = cpu_buffer->buffer; struct buffer_page *next_page; int ret; - u64 ts; next_page = tail_page; @@ -2227,20 +2216,17 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, } } - ret = rb_tail_page_update(cpu_buffer, tail_page, next_page); - if (ret) { - /* - * Nested commits always have zero deltas, so - * just reread the time stamp - */ - ts = rb_time_stamp(buffer); - next_page->page->time_stamp = ts; - } + rb_tail_page_update(cpu_buffer, tail_page, next_page); out_again: rb_reset_tail(cpu_buffer, tail, info); + /* Commit what we have for now. */ + rb_end_commit(cpu_buffer); + /* rb_end_commit() decs committing */ + local_inc(&cpu_buffer->committing); + /* fail and let the caller try again */ return ERR_PTR(-EAGAIN); @@ -2368,7 +2354,7 @@ rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer, addr = (unsigned long)event; addr &= PAGE_MASK; - bpage = cpu_buffer->tail_page; + bpage = READ_ONCE(cpu_buffer->tail_page); if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) { unsigned long write_mask = @@ -2416,7 +2402,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) again: max_count = cpu_buffer->nr_pages * 100; - while (cpu_buffer->commit_page != cpu_buffer->tail_page) { + while (cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page)) { if (RB_WARN_ON(cpu_buffer, !(--max_count))) return; if (RB_WARN_ON(cpu_buffer, @@ -2425,8 +2411,10 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->commit_page->page->commit, rb_page_write(cpu_buffer->commit_page)); rb_inc_page(cpu_buffer, &cpu_buffer->commit_page); - cpu_buffer->write_stamp = - cpu_buffer->commit_page->page->time_stamp; + /* Only update the write stamp if the page has an event */ + if (rb_page_write(cpu_buffer->commit_page)) + cpu_buffer->write_stamp = + cpu_buffer->commit_page->page->time_stamp; /* add barrier to keep gcc from optimizing too much */ barrier(); } @@ -2449,7 +2437,7 @@ rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer) * and pushed the tail page forward, we will be left with * a dangling commit that will never go forward. */ - if (unlikely(cpu_buffer->commit_page != cpu_buffer->tail_page)) + if (unlikely(cpu_buffer->commit_page != READ_ONCE(cpu_buffer->tail_page))) goto again; } @@ -2705,7 +2693,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(info->add_timestamp)) info->length += RB_LEN_TIME_EXTEND; - tail_page = info->tail_page = cpu_buffer->tail_page; + /* Don't let the compiler play games with cpu_buffer->tail_page */ + tail_page = info->tail_page = READ_ONCE(cpu_buffer->tail_page); write = local_add_return(info->length, &tail_page->write); /* set write to only the index of the write */ @@ -2803,8 +2792,11 @@ rb_reserve_next_event(struct ring_buffer *buffer, event = __rb_reserve_next(cpu_buffer, &info); - if (unlikely(PTR_ERR(event) == -EAGAIN)) + if (unlikely(PTR_ERR(event) == -EAGAIN)) { + if (info.add_timestamp) + info.length -= RB_LEN_TIME_EXTEND; goto again; + } if (!event) goto out_fail; @@ -3626,7 +3618,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) /* Finally update the reader page to the new head */ cpu_buffer->reader_page = reader; - rb_reset_reader_page(cpu_buffer); + cpu_buffer->reader_page->read = 0; if (overwrite != cpu_buffer->last_overrun) { cpu_buffer->lost_events = overwrite - cpu_buffer->last_overrun; @@ -3636,6 +3628,10 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) goto again; out: + /* Update the read_stamp on the first event */ + if (reader && reader->read == 0) + cpu_buffer->read_stamp = reader->page->time_stamp; + arch_spin_unlock(&cpu_buffer->lock); local_irq_restore(flags); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2198a630ef58..a2f0b9f33e9b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -74,11 +74,6 @@ static struct tracer_opt dummy_tracer_opt[] = { { } }; -static struct tracer_flags dummy_tracer_flags = { - .val = 0, - .opts = dummy_tracer_opt -}; - static int dummy_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -100,8 +95,6 @@ static DEFINE_PER_CPU(bool, trace_cmdline_save); */ static int tracing_disabled = 1; -DEFINE_PER_CPU(int, ftrace_cpu_disabled); - cpumask_var_t __read_mostly tracing_buffer_mask; /* @@ -1260,12 +1253,22 @@ int __init register_tracer(struct tracer *type) if (!type->set_flag) type->set_flag = &dummy_set_flag; - if (!type->flags) - type->flags = &dummy_tracer_flags; - else + if (!type->flags) { + /*allocate a dummy tracer_flags*/ + type->flags = kmalloc(sizeof(*type->flags), GFP_KERNEL); + if (!type->flags) { + ret = -ENOMEM; + goto out; + } + type->flags->val = 0; + type->flags->opts = dummy_tracer_opt; + } else if (!type->flags->opts) type->flags->opts = dummy_tracer_opt; + /* store the tracer for __set_tracer_option */ + type->flags->trace = type; + ret = run_tracer_selftest(type); if (ret < 0) goto out; @@ -1661,6 +1664,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, #else TRACE_FLAG_IRQS_NOSUPPORT | #endif + ((pc & NMI_MASK ) ? TRACE_FLAG_NMI : 0) | ((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) | ((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) | (tif_need_resched() ? TRACE_FLAG_NEED_RESCHED : 0) | @@ -1753,7 +1757,7 @@ void trace_buffer_unlock_commit_regs(struct trace_array *tr, { __buffer_unlock_commit(buffer, event); - ftrace_trace_stack(tr, buffer, flags, 6, pc, regs); + ftrace_trace_stack(tr, buffer, flags, 0, pc, regs); ftrace_trace_userstack(buffer, flags, pc); } EXPORT_SYMBOL_GPL(trace_buffer_unlock_commit_regs); @@ -1775,10 +1779,6 @@ trace_function(struct trace_array *tr, struct ring_buffer_event *event; struct ftrace_entry *entry; - /* If we are reading the ring buffer, don't trace */ - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_FN, sizeof(*entry), flags, pc); if (!event) @@ -2077,20 +2077,20 @@ void trace_printk_init_buffers(void) /* trace_printk() is for debug use only. Don't use it in production. */ - pr_warning("\n"); - pr_warning("**********************************************************\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("** **\n"); - pr_warning("** trace_printk() being used. Allocating extra memory. **\n"); - pr_warning("** **\n"); - pr_warning("** This means that this is a DEBUG kernel and it is **\n"); - pr_warning("** unsafe for production use. **\n"); - pr_warning("** **\n"); - pr_warning("** If you see this message and you are not debugging **\n"); - pr_warning("** the kernel, report this immediately to your vendor! **\n"); - pr_warning("** **\n"); - pr_warning("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); - pr_warning("**********************************************************\n"); + pr_warn("\n"); + pr_warn("**********************************************************\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("** **\n"); + pr_warn("** trace_printk() being used. Allocating extra memory. **\n"); + pr_warn("** **\n"); + pr_warn("** This means that this is a DEBUG kernel and it is **\n"); + pr_warn("** unsafe for production use. **\n"); + pr_warn("** **\n"); + pr_warn("** If you see this message and you are not debugging **\n"); + pr_warn("** the kernel, report this immediately to your vendor! **\n"); + pr_warn("** **\n"); + pr_warn("** NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE NOTICE **\n"); + pr_warn("**********************************************************\n"); /* Expand the buffers to set size */ tracing_update_buffers(); @@ -3511,7 +3511,7 @@ static int __set_tracer_option(struct trace_array *tr, struct tracer_flags *tracer_flags, struct tracer_opt *opts, int neg) { - struct tracer *trace = tr->current_trace; + struct tracer *trace = tracer_flags->trace; int ret; ret = trace->set_flag(tr, tracer_flags->val, opts->bit, !neg); @@ -4107,7 +4107,7 @@ trace_insert_enum_map_file(struct module *mod, struct trace_enum_map **start, */ map_array = kmalloc(sizeof(*map_array) * (len + 2), GFP_KERNEL); if (!map_array) { - pr_warning("Unable to allocate trace enum mapping\n"); + pr_warn("Unable to allocate trace enum mapping\n"); return; } @@ -4554,6 +4554,8 @@ out: return ret; } +#ifdef CONFIG_TRACER_MAX_TRACE + static ssize_t tracing_max_lat_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -4568,6 +4570,8 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf, return tracing_nsecs_write(filp->private_data, ubuf, cnt, ppos); } +#endif + static int tracing_open_pipe(struct inode *inode, struct file *filp) { struct trace_array *tr = inode->i_private; @@ -4951,7 +4955,10 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, spd.nr_pages = i; - ret = splice_to_pipe(pipe, &spd); + if (i) + ret = splice_to_pipe(pipe, &spd); + else + ret = 0; out: splice_shrink_spd(&spd); return ret; @@ -5469,12 +5476,14 @@ static const struct file_operations tracing_thresh_fops = { .llseek = generic_file_llseek, }; +#ifdef CONFIG_TRACER_MAX_TRACE static const struct file_operations tracing_max_lat_fops = { .open = tracing_open_generic, .read = tracing_max_lat_read, .write = tracing_max_lat_write, .llseek = generic_file_llseek, }; +#endif static const struct file_operations set_tracer_fops = { .open = tracing_open_generic, @@ -6131,7 +6140,7 @@ tracing_init_tracefs_percpu(struct trace_array *tr, long cpu) snprintf(cpu_dir, 30, "cpu%ld", cpu); d_cpu = tracefs_create_dir(cpu_dir, d_percpu); if (!d_cpu) { - pr_warning("Could not create tracefs '%s' entry\n", cpu_dir); + pr_warn("Could not create tracefs '%s' entry\n", cpu_dir); return; } @@ -6318,7 +6327,7 @@ struct dentry *trace_create_file(const char *name, ret = tracefs_create_file(name, mode, parent, data, fops); if (!ret) - pr_warning("Could not create tracefs '%s' entry\n", name); + pr_warn("Could not create tracefs '%s' entry\n", name); return ret; } @@ -6337,7 +6346,7 @@ static struct dentry *trace_options_init_dentry(struct trace_array *tr) tr->options = tracefs_create_dir("options", d_tracer); if (!tr->options) { - pr_warning("Could not create tracefs directory 'options'\n"); + pr_warn("Could not create tracefs directory 'options'\n"); return NULL; } @@ -6391,11 +6400,8 @@ create_trace_option_files(struct trace_array *tr, struct tracer *tracer) return; for (i = 0; i < tr->nr_topts; i++) { - /* - * Check if these flags have already been added. - * Some tracers share flags. - */ - if (tr->topts[i].tracer->flags == tracer->flags) + /* Make sure there's no duplicate flags. */ + if (WARN_ON_ONCE(tr->topts[i].tracer->flags == tracer->flags)) return; } @@ -6847,7 +6853,9 @@ struct dentry *tracing_init_dentry(void) if (tr->dir) return NULL; - if (WARN_ON(!debugfs_initialized())) + if (WARN_ON(!tracefs_initialized()) || + (IS_ENABLED(CONFIG_DEBUG_FS) && + WARN_ON(!debugfs_initialized()))) return ERR_PTR(-ENODEV); /* @@ -7246,8 +7254,8 @@ __init static int tracer_alloc_buffers(void) if (trace_boot_clock) { ret = tracing_set_clock(&global_trace, trace_boot_clock); if (ret < 0) - pr_warning("Trace clock %s not defined, going back to default\n", - trace_boot_clock); + pr_warn("Trace clock %s not defined, going back to default\n", + trace_boot_clock); } /* diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dd7620802e72..3fff4adfd431 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -125,6 +125,7 @@ enum trace_flag_type { TRACE_FLAG_HARDIRQ = 0x08, TRACE_FLAG_SOFTIRQ = 0x10, TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, }; #define TRACE_BUF_SIZE 1024 @@ -345,6 +346,7 @@ struct tracer_opt { struct tracer_flags { u32 val; struct tracer_opt *opts; + struct tracer *trace; }; /* Makes more easy to define a tracer opt */ @@ -363,8 +365,8 @@ struct trace_option_dentry { * @name: the name chosen to select it on the available_tracers file * @init: called when one switches to this tracer (echo name > current_tracer) * @reset: called when one switches to another tracer - * @start: called when tracing is unpaused (echo 1 > tracing_enabled) - * @stop: called when tracing is paused (echo 0 > tracing_enabled) + * @start: called when tracing is unpaused (echo 1 > tracing_on) + * @stop: called when tracing is paused (echo 0 > tracing_on) * @update_thresh: called when tracing_thresh is updated * @open: called when the trace file is opened * @pipe_open: called when the trace_pipe file is opened @@ -467,8 +469,6 @@ enum { TRACE_INTERNAL_IRQ_BIT, TRACE_INTERNAL_SIRQ_BIT, - TRACE_CONTROL_BIT, - TRACE_BRANCH_BIT, /* * Abuse of the trace_recursion. @@ -667,7 +667,6 @@ extern int DYN_FTRACE_TEST_NAME2(void); extern bool ring_buffer_expanded; extern bool tracing_selftest_disabled; -DECLARE_PER_CPU(int, ftrace_cpu_disabled); #ifdef CONFIG_FTRACE_STARTUP_TEST extern int trace_selftest_startup_function(struct tracer *trace, @@ -1114,6 +1113,18 @@ struct filter_pred { unsigned short right; }; +static inline bool is_string_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_DYN_STRING || + field->filter_type == FILTER_STATIC_STRING || + field->filter_type == FILTER_PTR_STRING; +} + +static inline bool is_function_field(struct ftrace_event_field *field) +{ + return field->filter_type == FILTER_TRACE_FN; +} + extern enum regex_type filter_parse_regex(char *buff, int len, char **search, int *not); extern void print_event_filter(struct trace_event_file *file, @@ -1162,9 +1173,24 @@ struct event_trigger_data { struct event_filter __rcu *filter; char *filter_str; void *private_data; + bool paused; struct list_head list; }; +extern void trigger_data_free(struct event_trigger_data *data); +extern int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data); +extern int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable); +extern void update_cond_flag(struct trace_event_file *file); +extern void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file); +extern int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file); +extern int register_event_command(struct event_command *cmd); + /** * struct event_trigger_ops - callbacks for trace event triggers * @@ -1177,7 +1203,8 @@ struct event_trigger_data { * @func: The trigger 'probe' function called when the triggering * event occurs. The data passed into this callback is the data * that was supplied to the event_command @reg() function that - * registered the trigger (see struct event_command). + * registered the trigger (see struct event_command) along with + * the trace record, rec. * * @init: An optional initialization function called for the trigger * when the trigger is registered (via the event_command reg() @@ -1202,7 +1229,8 @@ struct event_trigger_data { * (see trace_event_triggers.c). */ struct event_trigger_ops { - void (*func)(struct event_trigger_data *data); + void (*func)(struct event_trigger_data *data, + void *rec); int (*init)(struct event_trigger_ops *ops, struct event_trigger_data *data); void (*free)(struct event_trigger_ops *ops, @@ -1246,27 +1274,10 @@ struct event_trigger_ops { * values are defined by adding new values to the trigger_type * enum in include/linux/trace_events.h. * - * @post_trigger: A flag that says whether or not this command needs - * to have its action delayed until after the current event has - * been closed. Some triggers need to avoid being invoked while - * an event is currently in the process of being logged, since - * the trigger may itself log data into the trace buffer. Thus - * we make sure the current event is committed before invoking - * those triggers. To do that, the trigger invocation is split - * in two - the first part checks the filter using the current - * trace record; if a command has the @post_trigger flag set, it - * sets a bit for itself in the return value, otherwise it - * directly invokes the trigger. Once all commands have been - * either invoked or set their return flag, the current record is - * either committed or discarded. At that point, if any commands - * have deferred their triggers, those commands are finally - * invoked following the close of the current event. In other - * words, if the event_trigger_ops @func() probe implementation - * itself logs to the trace buffer, this flag should be set, - * otherwise it can be left unspecified. + * @flags: See the enum event_command_flags below. * - * All the methods below, except for @set_filter(), must be - * implemented. + * All the methods below, except for @set_filter() and @unreg_all(), + * must be implemented. * * @func: The callback function responsible for parsing and * registering the trigger written to the 'trigger' file by the @@ -1291,6 +1302,10 @@ struct event_trigger_ops { * This is usually implemented by the generic utility function * @unregister_trigger() (see trace_event_triggers.c). * + * @unreg_all: An optional function called to remove all the triggers + * from the list of triggers associated with the event. Called + * when a trigger file is opened in truncate mode. + * * @set_filter: An optional function called to parse and set a filter * for the trigger. If no @set_filter() method is set for the * event command, filters set by the user for the command will be @@ -1304,7 +1319,7 @@ struct event_command { struct list_head list; char *name; enum event_trigger_type trigger_type; - bool post_trigger; + int flags; int (*func)(struct event_command *cmd_ops, struct trace_event_file *file, char *glob, char *cmd, char *params); @@ -1316,12 +1331,56 @@ struct event_command { struct event_trigger_ops *ops, struct event_trigger_data *data, struct trace_event_file *file); + void (*unreg_all)(struct trace_event_file *file); int (*set_filter)(char *filter_str, struct event_trigger_data *data, struct trace_event_file *file); struct event_trigger_ops *(*get_trigger_ops)(char *cmd, char *param); }; +/** + * enum event_command_flags - flags for struct event_command + * + * @POST_TRIGGER: A flag that says whether or not this command needs + * to have its action delayed until after the current event has + * been closed. Some triggers need to avoid being invoked while + * an event is currently in the process of being logged, since + * the trigger may itself log data into the trace buffer. Thus + * we make sure the current event is committed before invoking + * those triggers. To do that, the trigger invocation is split + * in two - the first part checks the filter using the current + * trace record; if a command has the @post_trigger flag set, it + * sets a bit for itself in the return value, otherwise it + * directly invokes the trigger. Once all commands have been + * either invoked or set their return flag, the current record is + * either committed or discarded. At that point, if any commands + * have deferred their triggers, those commands are finally + * invoked following the close of the current event. In other + * words, if the event_trigger_ops @func() probe implementation + * itself logs to the trace buffer, this flag should be set, + * otherwise it can be left unspecified. + * + * @NEEDS_REC: A flag that says whether or not this command needs + * access to the trace record in order to perform its function, + * regardless of whether or not it has a filter associated with + * it (filters make a trigger require access to the trace record + * but are not always present). + */ +enum event_command_flags { + EVENT_CMD_FL_POST_TRIGGER = 1, + EVENT_CMD_FL_NEEDS_REC = 2, +}; + +static inline bool event_command_post_trigger(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_POST_TRIGGER; +} + +static inline bool event_command_needs_rec(struct event_command *cmd_ops) +{ + return cmd_ops->flags & EVENT_CMD_FL_NEEDS_REC; +} + extern int trace_event_enable_disable(struct trace_event_file *file, int enable, int soft_disable); extern int tracing_alloc_snapshot(void); @@ -1368,8 +1427,13 @@ int perf_ftrace_event_register(struct trace_event_call *call, #ifdef CONFIG_FTRACE_SYSCALLS void init_ftrace_syscalls(void); +const char *get_syscall_name(int syscall); #else static inline void init_ftrace_syscalls(void) { } +static inline const char *get_syscall_name(int syscall) +{ + return NULL; +} #endif #ifdef CONFIG_EVENT_TRACING diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c index abfc903e741e..00df25fd86ef 100644 --- a/kernel/trace/trace_event_perf.c +++ b/kernel/trace/trace_event_perf.c @@ -1,7 +1,7 @@ /* * trace event based perf event profiling/tracing * - * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com> + * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com> */ @@ -334,7 +334,7 @@ static int perf_ftrace_function_register(struct perf_event *event) { struct ftrace_ops *ops = &event->ftrace_ops; - ops->flags |= FTRACE_OPS_FL_CONTROL; + ops->flags |= FTRACE_OPS_FL_PER_CPU | FTRACE_OPS_FL_RCU; ops->func = perf_ftrace_function_call; return register_ftrace_function(ops); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6bbc5f652355..05ddc0820771 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -97,16 +97,16 @@ trace_find_event_field(struct trace_event_call *call, char *name) struct ftrace_event_field *field; struct list_head *head; - field = __find_event_field(&ftrace_generic_fields, name); + head = trace_get_fields(call); + field = __find_event_field(head, name); if (field) return field; - field = __find_event_field(&ftrace_common_fields, name); + field = __find_event_field(&ftrace_generic_fields, name); if (field) return field; - head = trace_get_fields(call); - return __find_event_field(head, name); + return __find_event_field(&ftrace_common_fields, name); } static int __trace_define_field(struct list_head *head, const char *type, @@ -171,8 +171,10 @@ static int trace_define_generic_fields(void) { int ret; - __generic_field(int, cpu, FILTER_OTHER); - __generic_field(char *, comm, FILTER_PTR_STRING); + __generic_field(int, CPU, FILTER_CPU); + __generic_field(int, cpu, FILTER_CPU); + __generic_field(char *, COMM, FILTER_COMM); + __generic_field(char *, comm, FILTER_COMM); return ret; } @@ -582,6 +584,12 @@ static void __ftrace_clear_event_pids(struct trace_array *tr) unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_pre, tr); unregister_trace_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, tr); + + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_pre, tr); + unregister_trace_sched_waking(event_filter_pid_sched_wakeup_probe_post, tr); + list_for_each_entry(file, &tr->events, list) { clear_bit(EVENT_FILE_FL_PID_FILTER_BIT, &file->flags); } @@ -863,7 +871,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } @@ -1334,15 +1343,9 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); mutex_lock(&event_mutex); file = event_file_data(filp); @@ -1350,7 +1353,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = apply_event_filter(file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; @@ -1501,18 +1504,12 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; - - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long) buf); - return -EFAULT; - } - buf[cnt] = '\0'; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); err = apply_subsystem_event_filter(dir, buf); - free_page((unsigned long) buf); + kfree(buf); if (err < 0) return err; @@ -1729,6 +1726,16 @@ ftrace_event_pid_write(struct file *filp, const char __user *ubuf, tr, INT_MAX); register_trace_prio_sched_wakeup(event_filter_pid_sched_wakeup_probe_post, tr, 0); + + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_wakeup_new(event_filter_pid_sched_wakeup_probe_post, + tr, 0); + + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_pre, + tr, INT_MAX); + register_trace_prio_sched_waking(event_filter_pid_sched_wakeup_probe_post, + tr, 0); } /* diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f93a219b18da..b3f5051cd4e9 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -961,18 +961,6 @@ int filter_assign_type(const char *type) return FILTER_OTHER; } -static bool is_function_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_TRACE_FN; -} - -static bool is_string_field(struct ftrace_event_field *field) -{ - return field->filter_type == FILTER_DYN_STRING || - field->filter_type == FILTER_STATIC_STRING || - field->filter_type == FILTER_PTR_STRING; -} - static bool is_legal_op(struct ftrace_event_field *field, int op) { if (is_string_field(field) && @@ -1043,13 +1031,14 @@ static int init_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field)) { + if (field->filter_type == FILTER_COMM) { + filter_build_regex(pred); + fn = filter_pred_comm; + pred->regex.field_len = TASK_COMM_LEN; + } else if (is_string_field(field)) { filter_build_regex(pred); - if (!strcmp(field->name, "comm")) { - fn = filter_pred_comm; - pred->regex.field_len = TASK_COMM_LEN; - } else if (field->filter_type == FILTER_STATIC_STRING) { + if (field->filter_type == FILTER_STATIC_STRING) { fn = filter_pred_string; pred->regex.field_len = field->size; } else if (field->filter_type == FILTER_DYN_STRING) @@ -1072,7 +1061,7 @@ static int init_pred(struct filter_parse_state *ps, } pred->val = val; - if (!strcmp(field->name, "cpu")) + if (field->filter_type == FILTER_CPU) fn = filter_pred_cpu; else fn = select_comparison_fn(pred->op, field->size, diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index 42a4009fd75a..d67992f3bb0e 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -28,8 +28,7 @@ static LIST_HEAD(trigger_commands); static DEFINE_MUTEX(trigger_cmd_mutex); -static void -trigger_data_free(struct event_trigger_data *data) +void trigger_data_free(struct event_trigger_data *data) { if (data->cmd_ops->set_filter) data->cmd_ops->set_filter(NULL, data, NULL); @@ -73,18 +72,20 @@ event_triggers_call(struct trace_event_file *file, void *rec) return tt; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (!rec) { - data->ops->func(data); + data->ops->func(data, rec); continue; } filter = rcu_dereference_sched(data->filter); if (filter && !filter_match_preds(filter, rec)) continue; - if (data->cmd_ops->post_trigger) { + if (event_command_post_trigger(data->cmd_ops)) { tt |= data->cmd_ops->trigger_type; continue; } - data->ops->func(data); + data->ops->func(data, rec); } return tt; } @@ -94,6 +95,7 @@ EXPORT_SYMBOL_GPL(event_triggers_call); * event_triggers_post_call - Call 'post_triggers' for a trace event * @file: The trace_event_file associated with the event * @tt: enum event_trigger_type containing a set bit for each trigger to invoke + * @rec: The trace entry for the event * * For each trigger associated with an event, invoke the trigger * function registered with the associated trigger command, if the @@ -104,13 +106,16 @@ EXPORT_SYMBOL_GPL(event_triggers_call); */ void event_triggers_post_call(struct trace_event_file *file, - enum event_trigger_type tt) + enum event_trigger_type tt, + void *rec) { struct event_trigger_data *data; list_for_each_entry_rcu(data, &file->triggers, list) { + if (data->paused) + continue; if (data->cmd_ops->trigger_type & tt) - data->ops->func(data); + data->ops->func(data, rec); } } EXPORT_SYMBOL_GPL(event_triggers_post_call); @@ -188,6 +193,19 @@ static int event_trigger_regex_open(struct inode *inode, struct file *file) return -ENODEV; } + if ((file->f_mode & FMODE_WRITE) && + (file->f_flags & O_TRUNC)) { + struct trace_event_file *event_file; + struct event_command *p; + + event_file = event_file_data(file); + + list_for_each_entry(p, &trigger_commands, list) { + if (p->unreg_all) + p->unreg_all(event_file); + } + } + if (file->f_mode & FMODE_READ) { ret = seq_open(file, &event_triggers_seq_ops); if (!ret) { @@ -237,28 +255,23 @@ static ssize_t event_trigger_regex_write(struct file *file, if (cnt >= PAGE_SIZE) return -EINVAL; - buf = (char *)__get_free_page(GFP_TEMPORARY); - if (!buf) - return -ENOMEM; + buf = memdup_user_nul(ubuf, cnt); + if (IS_ERR(buf)) + return PTR_ERR(buf); - if (copy_from_user(buf, ubuf, cnt)) { - free_page((unsigned long)buf); - return -EFAULT; - } - buf[cnt] = '\0'; strim(buf); mutex_lock(&event_mutex); event_file = event_file_data(file); if (unlikely(!event_file)) { mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); return -ENODEV; } ret = trigger_process_regex(event_file, buf); mutex_unlock(&event_mutex); - free_page((unsigned long)buf); + kfree(buf); if (ret < 0) goto out; @@ -311,7 +324,7 @@ const struct file_operations event_trigger_fops = { * Currently we only register event commands from __init, so mark this * __init too. */ -static __init int register_event_command(struct event_command *cmd) +__init int register_event_command(struct event_command *cmd) { struct event_command *p; int ret = 0; @@ -400,9 +413,8 @@ event_trigger_print(const char *name, struct seq_file *m, * * Return: 0 on success, errno otherwise */ -static int -event_trigger_init(struct event_trigger_ops *ops, - struct event_trigger_data *data) +int event_trigger_init(struct event_trigger_ops *ops, + struct event_trigger_data *data) { data->ref++; return 0; @@ -430,8 +442,8 @@ event_trigger_free(struct event_trigger_ops *ops, trigger_data_free(data); } -static int trace_event_trigger_enable_disable(struct trace_event_file *file, - int trigger_enable) +int trace_event_trigger_enable_disable(struct trace_event_file *file, + int trigger_enable) { int ret = 0; @@ -488,13 +500,14 @@ clear_event_triggers(struct trace_array *tr) * its TRIGGER_COND bit set, otherwise the TRIGGER_COND bit should be * cleared. */ -static void update_cond_flag(struct trace_event_file *file) +void update_cond_flag(struct trace_event_file *file) { struct event_trigger_data *data; bool set_cond = false; list_for_each_entry_rcu(data, &file->triggers, list) { - if (data->filter || data->cmd_ops->post_trigger) { + if (data->filter || event_command_post_trigger(data->cmd_ops) || + event_command_needs_rec(data->cmd_ops)) { set_cond = true; break; } @@ -543,11 +556,12 @@ static int register_trigger(char *glob, struct event_trigger_ops *ops, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -564,9 +578,9 @@ out: * Usually used directly as the @unreg method in event command * implementations. */ -static void unregister_trigger(char *glob, struct event_trigger_ops *ops, - struct event_trigger_data *test, - struct trace_event_file *file) +void unregister_trigger(char *glob, struct event_trigger_ops *ops, + struct event_trigger_data *test, + struct trace_event_file *file) { struct event_trigger_data *data; bool unregistered = false; @@ -575,8 +589,8 @@ static void unregister_trigger(char *glob, struct event_trigger_ops *ops, if (data->cmd_ops->trigger_type == test->cmd_ops->trigger_type) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } @@ -700,9 +714,9 @@ event_trigger_callback(struct event_command *cmd_ops, * * Return: 0 on success, errno otherwise */ -static int set_trigger_filter(char *filter_str, - struct event_trigger_data *trigger_data, - struct trace_event_file *file) +int set_trigger_filter(char *filter_str, + struct event_trigger_data *trigger_data, + struct trace_event_file *file) { struct event_trigger_data *data = trigger_data; struct event_filter *filter = NULL, *tmp; @@ -751,7 +765,7 @@ static int set_trigger_filter(char *filter_str, } static void -traceon_trigger(struct event_trigger_data *data) +traceon_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -760,7 +774,7 @@ traceon_trigger(struct event_trigger_data *data) } static void -traceon_count_trigger(struct event_trigger_data *data) +traceon_count_trigger(struct event_trigger_data *data, void *rec) { if (tracing_is_on()) return; @@ -775,7 +789,7 @@ traceon_count_trigger(struct event_trigger_data *data) } static void -traceoff_trigger(struct event_trigger_data *data) +traceoff_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -784,7 +798,7 @@ traceoff_trigger(struct event_trigger_data *data) } static void -traceoff_count_trigger(struct event_trigger_data *data) +traceoff_count_trigger(struct event_trigger_data *data, void *rec) { if (!tracing_is_on()) return; @@ -880,13 +894,13 @@ static struct event_command trigger_traceoff_cmd = { #ifdef CONFIG_TRACER_SNAPSHOT static void -snapshot_trigger(struct event_trigger_data *data) +snapshot_trigger(struct event_trigger_data *data, void *rec) { tracing_snapshot(); } static void -snapshot_count_trigger(struct event_trigger_data *data) +snapshot_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -894,7 +908,7 @@ snapshot_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - snapshot_trigger(data); + snapshot_trigger(data, rec); } static int @@ -973,13 +987,13 @@ static __init int register_trigger_snapshot_cmd(void) { return 0; } #define STACK_SKIP 3 static void -stacktrace_trigger(struct event_trigger_data *data) +stacktrace_trigger(struct event_trigger_data *data, void *rec) { trace_dump_stack(STACK_SKIP); } static void -stacktrace_count_trigger(struct event_trigger_data *data) +stacktrace_count_trigger(struct event_trigger_data *data, void *rec) { if (!data->count) return; @@ -987,7 +1001,7 @@ stacktrace_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - stacktrace_trigger(data); + stacktrace_trigger(data, rec); } static int @@ -1021,7 +1035,7 @@ stacktrace_get_trigger_ops(char *cmd, char *param) static struct event_command trigger_stacktrace_cmd = { .name = "stacktrace", .trigger_type = ETT_STACKTRACE, - .post_trigger = true, + .flags = EVENT_CMD_FL_POST_TRIGGER, .func = event_trigger_callback, .reg = register_trigger, .unreg = unregister_trigger, @@ -1058,7 +1072,7 @@ struct enable_trigger_data { }; static void -event_enable_trigger(struct event_trigger_data *data) +event_enable_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1069,7 +1083,7 @@ event_enable_trigger(struct event_trigger_data *data) } static void -event_enable_count_trigger(struct event_trigger_data *data) +event_enable_count_trigger(struct event_trigger_data *data, void *rec) { struct enable_trigger_data *enable_data = data->private_data; @@ -1083,7 +1097,7 @@ event_enable_count_trigger(struct event_trigger_data *data) if (data->count != -1) (data->count)--; - event_enable_trigger(data); + event_enable_trigger(data, rec); } static int @@ -1319,11 +1333,12 @@ static int event_enable_register_trigger(char *glob, list_add_rcu(&data->list, &file->triggers); ret++; + update_cond_flag(file); if (trace_event_trigger_enable_disable(file, 1) < 0) { list_del_rcu(&data->list); + update_cond_flag(file); ret--; } - update_cond_flag(file); out: return ret; } @@ -1344,8 +1359,8 @@ static void event_enable_unregister_trigger(char *glob, (enable_data->file == test_enable_data->file)) { unregistered = true; list_del_rcu(&data->list); - update_cond_flag(file); trace_event_trigger_enable_disable(file, 0); + update_cond_flag(file); break; } } diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index fcd41a166405..5a095c2e4b69 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -219,6 +219,8 @@ static void tracing_stop_function_trace(struct trace_array *tr) unregister_ftrace_function(tr->ops); } +static struct tracer function_trace; + static int func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) { @@ -228,6 +230,10 @@ func_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK)) break; + /* We can change this flag when not running. */ + if (tr->current_trace != &function_trace) + break; + unregister_ftrace_function(tr->ops); if (set) { diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 92382af7a213..3a0244ff7ea8 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -8,6 +8,7 @@ */ #include <linux/uaccess.h> #include <linux/ftrace.h> +#include <linux/interrupt.h> #include <linux/slab.h> #include <linux/fs.h> @@ -288,9 +289,6 @@ int __trace_graph_entry(struct trace_array *tr, struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ent_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return 0; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_ENT, sizeof(*entry), flags, pc); if (!event) @@ -403,9 +401,6 @@ void __trace_graph_return(struct trace_array *tr, struct ring_buffer *buffer = tr->trace_buffer.buffer; struct ftrace_graph_ret_entry *entry; - if (unlikely(__this_cpu_read(ftrace_cpu_disabled))) - return; - event = trace_buffer_lock_reserve(buffer, TRACE_GRAPH_RET, sizeof(*entry), flags, pc); if (!event) @@ -1356,7 +1351,7 @@ void graph_trace_open(struct trace_iterator *iter) out_err_free: kfree(data); out_err: - pr_warning("function graph tracer: not enough memory\n"); + pr_warn("function graph tracer: not enough memory\n"); } void graph_trace_close(struct trace_iterator *iter) @@ -1474,12 +1469,12 @@ static __init int init_graph_trace(void) max_bytes_for_cpu = snprintf(NULL, 0, "%d", nr_cpu_ids - 1); if (!register_trace_event(&graph_trace_entry_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } if (!register_trace_event(&graph_trace_ret_event)) { - pr_warning("Warning: could not register graph trace events\n"); + pr_warn("Warning: could not register graph trace events\n"); return 1; } diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index e4e56589ec1d..03cdff84d026 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -109,8 +109,12 @@ static int func_prolog_dec(struct trace_array *tr, return 0; local_save_flags(*flags); - /* slight chance to get a false positive on tracing_cpu */ - if (!irqs_disabled_flags(*flags)) + /* + * Slight chance to get a false positive on tracing_cpu, + * although I'm starting to think there isn't a chance. + * Leave this for now just to be paranoid. + */ + if (!irqs_disabled_flags(*flags) && !preempt_count()) return 0; *data = per_cpu_ptr(tr->trace_buffer.data, cpu); @@ -622,7 +626,6 @@ static int __irqsoff_tracer_init(struct trace_array *tr) irqsoff_trace = tr; /* make sure that the tracer is visible */ smp_wmb(); - tracing_reset_online_cpus(&tr->trace_buffer); ftrace_init_array_ops(tr, irqsoff_tracer_call); diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index c9956440d0e6..919e0ddd8fcc 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -30,7 +30,7 @@ struct trace_kprobe { struct list_head list; struct kretprobe rp; /* Use rp.kp for kprobe use */ - unsigned long nhit; + unsigned long __percpu *nhit; const char *symbol; /* symbol name */ struct trace_probe tp; }; @@ -274,6 +274,10 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, if (!tk) return ERR_PTR(ret); + tk->nhit = alloc_percpu(unsigned long); + if (!tk->nhit) + goto error; + if (symbol) { tk->symbol = kstrdup(symbol, GFP_KERNEL); if (!tk->symbol) @@ -313,6 +317,7 @@ static struct trace_kprobe *alloc_trace_kprobe(const char *group, error: kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); return ERR_PTR(ret); } @@ -327,6 +332,7 @@ static void free_trace_kprobe(struct trace_kprobe *tk) kfree(tk->tp.call.class->system); kfree(tk->tp.call.name); kfree(tk->symbol); + free_percpu(tk->nhit); kfree(tk); } @@ -453,16 +459,14 @@ static int __register_trace_kprobe(struct trace_kprobe *tk) if (ret == 0) tk->tp.flags |= TP_FLAG_REGISTERED; else { - pr_warning("Could not insert probe at %s+%lu: %d\n", - trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); + pr_warn("Could not insert probe at %s+%lu: %d\n", + trace_kprobe_symbol(tk), trace_kprobe_offset(tk), ret); if (ret == -ENOENT && trace_kprobe_is_on_module(tk)) { - pr_warning("This probe might be able to register after" - "target module is loaded. Continue.\n"); + pr_warn("This probe might be able to register after target module is loaded. Continue.\n"); ret = 0; } else if (ret == -EILSEQ) { - pr_warning("Probing address(0x%p) is not an " - "instruction boundary.\n", - tk->rp.kp.addr); + pr_warn("Probing address(0x%p) is not an instruction boundary.\n", + tk->rp.kp.addr); ret = -EINVAL; } } @@ -523,7 +527,7 @@ static int register_trace_kprobe(struct trace_kprobe *tk) /* Register new event */ ret = register_kprobe_event(tk); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } @@ -558,10 +562,9 @@ static int trace_kprobe_module_callback(struct notifier_block *nb, __unregister_trace_kprobe(tk); ret = __register_trace_kprobe(tk); if (ret) - pr_warning("Failed to re-register probe %s on" - "%s: %d\n", - trace_event_name(&tk->tp.call), - mod->name, ret); + pr_warn("Failed to re-register probe %s on %s: %d\n", + trace_event_name(&tk->tp.call), + mod->name, ret); } } mutex_unlock(&probe_lock); @@ -874,9 +877,14 @@ static const struct file_operations kprobe_events_ops = { static int probes_profile_seq_show(struct seq_file *m, void *v) { struct trace_kprobe *tk = v; + unsigned long nhit = 0; + int cpu; + + for_each_possible_cpu(cpu) + nhit += *per_cpu_ptr(tk->nhit, cpu); seq_printf(m, " %-44s %15lu %15lu\n", - trace_event_name(&tk->tp.call), tk->nhit, + trace_event_name(&tk->tp.call), nhit, tk->rp.kp.nmissed); return 0; @@ -1225,7 +1233,7 @@ static int kprobe_dispatcher(struct kprobe *kp, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(kp, struct trace_kprobe, rp.kp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kprobe_trace_func(tk, regs); @@ -1242,7 +1250,7 @@ kretprobe_dispatcher(struct kretprobe_instance *ri, struct pt_regs *regs) { struct trace_kprobe *tk = container_of(ri->rp, struct trace_kprobe, rp); - tk->nhit++; + raw_cpu_inc(*tk->nhit); if (tk->tp.flags & TP_FLAG_TRACE) kretprobe_trace_func(tk, ri, regs); @@ -1325,16 +1333,14 @@ static __init int init_kprobe_trace(void) /* Event list interface */ if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_events' entry\n"); + pr_warn("Could not create tracefs 'kprobe_events' entry\n"); /* Profile interface */ entry = tracefs_create_file("kprobe_profile", 0444, d_tracer, NULL, &kprobe_profile_ops); if (!entry) - pr_warning("Could not create tracefs " - "'kprobe_profile' entry\n"); + pr_warn("Could not create tracefs 'kprobe_profile' entry\n"); return 0; } fs_initcall(init_kprobe_trace); diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 2be8c4f2403d..68f376ca6d3f 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -146,7 +146,7 @@ static ssize_t mmio_read(struct trace_iterator *iter, struct file *filp, /* XXX: This is later than where events were lost. */ trace_seq_printf(s, "MARK 0.000000 Lost %lu events.\n", n); if (!overrun_detected) - pr_warning("mmiotrace has lost events.\n"); + pr_warn("mmiotrace has lost events\n"); overrun_detected = true; goto print_out; } diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c index 8bb2071474dd..49f61fe96a6b 100644 --- a/kernel/trace/trace_nop.c +++ b/kernel/trace/trace_nop.c @@ -56,7 +56,7 @@ static void nop_trace_reset(struct trace_array *tr) } /* It only serves as a signal handler and a callback to - * accept or refuse tthe setting of a flag. + * accept or refuse the setting of a flag. * If you don't implement it, then the flag setting will be * automatically accepted. */ @@ -75,7 +75,7 @@ static int nop_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set) if (bit == TRACE_NOP_OPT_REFUSE) { printk(KERN_DEBUG "nop_test_refuse flag set to %d: we refuse." - "Now cat trace_options to see the result\n", + " Now cat trace_options to see the result\n", set); return -EINVAL; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 282982195e09..0bb9cf2d53e6 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -389,7 +389,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) char irqs_off; int hardirq; int softirq; + int nmi; + nmi = entry->flags & TRACE_FLAG_NMI; hardirq = entry->flags & TRACE_FLAG_HARDIRQ; softirq = entry->flags & TRACE_FLAG_SOFTIRQ; @@ -415,10 +417,12 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) } hardsoft_irq = + (nmi && hardirq) ? 'Z' : + nmi ? 'z' : (hardirq && softirq) ? 'H' : - hardirq ? 'h' : - softirq ? 's' : - '.'; + hardirq ? 'h' : + softirq ? 's' : + '.' ; trace_seq_printf(s, "%c%c%c", irqs_off, need_resched, hardsoft_irq); diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index 1c2b28536feb..f96f0383f6c6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -273,6 +273,7 @@ static const char **find_next(void *v, loff_t *pos) if (*pos < last_index + start_index) return __start___tracepoint_str + (*pos - last_index); + start_index += last_index; return find_next_mod_format(start_index, v, fmt, pos); } @@ -295,6 +296,9 @@ static int t_show(struct seq_file *m, void *v) const char *str = *fmt; int i; + if (!*fmt) + return 0; + seq_printf(m, "0x%lx : \"", *(unsigned long *)fmt); /* diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index 1769a81da8a7..1d372fa6fefb 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -636,8 +636,8 @@ ssize_t traceprobe_probes_write(struct file *file, const char __user *buffer, *tmp = '\0'; size = tmp - kbuf + 1; } else if (done + size < count) { - pr_warning("Line length is too long: " - "Should be less than %d.", WRITE_BUFSIZE); + pr_warn("Line length is too long: Should be less than %d\n", + WRITE_BUFSIZE); ret = -EINVAL; goto out; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dda9e6742950..2a1abbaca10e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack) } /* + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. + */ + if (i == stack_trace_max.nr_entries) + i = 0; + + /* * Now find where in the stack these are. */ x = 0; @@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index 6cf935316769..413ff108fbd0 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -281,8 +281,7 @@ static int tracing_stat_init(void) stat_dir = tracefs_create_dir("trace_stat", d_tracing); if (!stat_dir) - pr_warning("Could not create tracefs " - "'trace_stat' entry\n"); + pr_warn("Could not create tracefs 'trace_stat' entry\n"); return 0; } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 0655afbea83f..e78f364cc192 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -106,6 +106,17 @@ static struct syscall_metadata *syscall_nr_to_meta(int nr) return syscalls_metadata[nr]; } +const char *get_syscall_name(int syscall) +{ + struct syscall_metadata *entry; + + entry = syscall_nr_to_meta(syscall); + if (!entry) + return NULL; + + return entry->name; +} + static enum print_line_t print_syscall_enter(struct trace_iterator *iter, int flags, struct trace_event *event) @@ -186,11 +197,11 @@ print_syscall_exit(struct trace_iterator *iter, int flags, extern char *__bad_type_size(void); -#define SYSCALL_FIELD(type, name) \ - sizeof(type) != sizeof(trace.name) ? \ +#define SYSCALL_FIELD(type, field, name) \ + sizeof(type) != sizeof(trace.field) ? \ __bad_type_size() : \ - #type, #name, offsetof(typeof(trace), name), \ - sizeof(trace.name), is_signed_type(type) + #type, #name, offsetof(typeof(trace), field), \ + sizeof(trace.field), is_signed_type(type) static int __init __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) @@ -261,7 +272,8 @@ static int __init syscall_enter_define_fields(struct trace_event_call *call) int i; int offset = offsetof(typeof(trace), args); - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; @@ -281,11 +293,12 @@ static int __init syscall_exit_define_fields(struct trace_event_call *call) struct syscall_trace_exit trace; int ret; - ret = trace_define_field(call, SYSCALL_FIELD(int, nr), FILTER_OTHER); + ret = trace_define_field(call, SYSCALL_FIELD(int, nr, __syscall_nr), + FILTER_OTHER); if (ret) return ret; - ret = trace_define_field(call, SYSCALL_FIELD(long, ret), + ret = trace_define_field(call, SYSCALL_FIELD(long, ret, ret), FILTER_OTHER); return ret; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index d2f6d0be3503..7915142c89e4 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -334,7 +334,7 @@ static int register_trace_uprobe(struct trace_uprobe *tu) ret = register_uprobe_event(tu); if (ret) { - pr_warning("Failed to register probe event(%d)\n", ret); + pr_warn("Failed to register probe event(%d)\n", ret); goto end; } diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c index ecd536de603a..d0639d917899 100644 --- a/kernel/tracepoint.c +++ b/kernel/tracepoint.c @@ -491,7 +491,7 @@ static __init int init_tracepoints(void) ret = register_module_notifier(&tracepoint_module_nb); if (ret) - pr_warning("Failed to register tracepoint module enter notifier\n"); + pr_warn("Failed to register tracepoint module enter notifier\n"); return ret; } diff --git a/kernel/tsacct.c b/kernel/tsacct.c index 975cb49e32bf..f8e26ab963ed 100644 --- a/kernel/tsacct.c +++ b/kernel/tsacct.c @@ -93,9 +93,11 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) { struct mm_struct *mm; - /* convert pages-usec to Mbyte-usec */ - stats->coremem = p->acct_rss_mem1 * PAGE_SIZE / MB; - stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE / MB; + /* convert pages-nsec/1024 to Mbyte-usec, see __acct_update_integrals */ + stats->coremem = p->acct_rss_mem1 * PAGE_SIZE; + do_div(stats->coremem, 1000 * KB); + stats->virtmem = p->acct_vm_mem1 * PAGE_SIZE; + do_div(stats->virtmem, 1000 * KB); mm = get_task_mm(p); if (mm) { /* adjust to KB unit */ @@ -123,27 +125,28 @@ void xacct_add_tsk(struct taskstats *stats, struct task_struct *p) static void __acct_update_integrals(struct task_struct *tsk, cputime_t utime, cputime_t stime) { - if (likely(tsk->mm)) { - cputime_t time, dtime; - struct timeval value; - unsigned long flags; - u64 delta; - - local_irq_save(flags); - time = stime + utime; - dtime = time - tsk->acct_timexpd; - jiffies_to_timeval(cputime_to_jiffies(dtime), &value); - delta = value.tv_sec; - delta = delta * USEC_PER_SEC + value.tv_usec; - - if (delta == 0) - goto out; - tsk->acct_timexpd = time; - tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm); - tsk->acct_vm_mem1 += delta * tsk->mm->total_vm; - out: - local_irq_restore(flags); - } + cputime_t time, dtime; + u64 delta; + + if (!likely(tsk->mm)) + return; + + time = stime + utime; + dtime = time - tsk->acct_timexpd; + /* Avoid division: cputime_t is often in nanoseconds already. */ + delta = cputime_to_nsecs(dtime); + + if (delta < TICK_NSEC) + return; + + tsk->acct_timexpd = time; + /* + * Divide by 1024 to avoid overflow, and to avoid division. + * The final unit reported to userspace is Mbyte-usecs, + * the rest of the math is done in xacct_add_tsk. + */ + tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm) >> 10; + tsk->acct_vm_mem1 += delta * tsk->mm->total_vm >> 10; } /** @@ -153,9 +156,12 @@ static void __acct_update_integrals(struct task_struct *tsk, void acct_update_integrals(struct task_struct *tsk) { cputime_t utime, stime; + unsigned long flags; + local_irq_save(flags); task_cputime(tsk, &utime, &stime); __acct_update_integrals(tsk, utime, stime); + local_irq_restore(flags); } /** diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c index 88fefa68c516..9bafc211930c 100644 --- a/kernel/user_namespace.c +++ b/kernel/user_namespace.c @@ -602,8 +602,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, struct uid_gid_map new_map; unsigned idx; struct uid_gid_extent *extent = NULL; - unsigned long page = 0; - char *kbuf, *pos, *next_line; + char *kbuf = NULL, *pos, *next_line; ssize_t ret = -EINVAL; /* @@ -638,23 +637,18 @@ static ssize_t map_write(struct file *file, const char __user *buf, if (cap_valid(cap_setid) && !file_ns_capable(file, ns, CAP_SYS_ADMIN)) goto out; - /* Get a buffer */ - ret = -ENOMEM; - page = __get_free_page(GFP_TEMPORARY); - kbuf = (char *) page; - if (!page) - goto out; - /* Only allow < page size writes at the beginning of the file */ ret = -EINVAL; if ((*ppos != 0) || (count >= PAGE_SIZE)) goto out; /* Slurp in the user data */ - ret = -EFAULT; - if (copy_from_user(kbuf, buf, count)) + kbuf = memdup_user_nul(buf, count); + if (IS_ERR(kbuf)) { + ret = PTR_ERR(kbuf); + kbuf = NULL; goto out; - kbuf[count] = '\0'; + } /* Parse the user data */ ret = -EINVAL; @@ -756,8 +750,7 @@ static ssize_t map_write(struct file *file, const char __user *buf, ret = count; out: mutex_unlock(&userns_state_mutex); - if (page) - free_page(page); + kfree(kbuf); return ret; } diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 18f34cf75f74..9acb29f280ec 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -20,6 +20,7 @@ #include <linux/smpboot.h> #include <linux/sched/rt.h> #include <linux/tick.h> +#include <linux/workqueue.h> #include <asm/irq_regs.h> #include <linux/kvm_para.h> @@ -225,7 +226,15 @@ static void __touch_watchdog(void) __this_cpu_write(watchdog_touch_ts, get_timestamp()); } -void touch_softlockup_watchdog(void) +/** + * touch_softlockup_watchdog_sched - touch watchdog on scheduler stalls + * + * Call when the scheduler may have stalled for legitimate reasons + * preventing the watchdog task from executing - e.g. the scheduler + * entering idle state. This should only be used for scheduler events. + * Use touch_softlockup_watchdog() for everything else. + */ +void touch_softlockup_watchdog_sched(void) { /* * Preemption can be enabled. It doesn't matter which CPU's timestamp @@ -233,6 +242,12 @@ void touch_softlockup_watchdog(void) */ raw_cpu_write(watchdog_touch_ts, 0); } + +void touch_softlockup_watchdog(void) +{ + touch_softlockup_watchdog_sched(); + wq_watchdog_touch(raw_smp_processor_id()); +} EXPORT_SYMBOL(touch_softlockup_watchdog); void touch_all_softlockup_watchdogs(void) @@ -246,6 +261,7 @@ void touch_all_softlockup_watchdogs(void) */ for_each_watchdog_cpu(cpu) per_cpu(watchdog_touch_ts, cpu) = 0; + wq_watchdog_touch(-1); } #ifdef CONFIG_HARDLOCKUP_DETECTOR @@ -351,7 +367,7 @@ static void watchdog_overflow_callback(struct perf_event *event, trigger_allbutself_cpu_backtrace(); if (hardlockup_panic) - panic("Hard LOCKUP"); + nmi_panic(regs, "Hard LOCKUP"); __this_cpu_write(hard_watchdog_warn, true); return; @@ -907,6 +923,9 @@ static int proc_watchdog_common(int which, struct ctl_table *table, int write, * both lockup detectors are disabled if proc_watchdog_update() * returns an error. */ + if (old == new) + goto out; + err = proc_watchdog_update(); } out: @@ -951,7 +970,7 @@ int proc_soft_watchdog(struct ctl_table *table, int write, int proc_watchdog_thresh(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - int err, old; + int err, old, new; get_online_cpus(); mutex_lock(&watchdog_proc_mutex); @@ -971,6 +990,10 @@ int proc_watchdog_thresh(struct ctl_table *table, int write, /* * Update the sample period. Restore on failure. */ + new = ACCESS_ONCE(watchdog_thresh); + if (old == new) + goto out; + set_sample_period(); err = proc_watchdog_update(); if (err) { diff --git a/kernel/workqueue.c b/kernel/workqueue.c index c579dbab2e36..2232ae3e3ad6 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -148,6 +148,8 @@ struct worker_pool { int id; /* I: pool ID */ unsigned int flags; /* X: flags */ + unsigned long watchdog_ts; /* L: watchdog timestamp */ + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -299,11 +301,26 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ -static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], - cpu_worker_pools); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], cpu_worker_pools); static DEFINE_IDR(worker_pool_idr); /* PR: idr of all pools */ @@ -568,6 +585,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -830,7 +857,6 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) /** * wq_worker_sleeping - a worker is going to sleep * @task: task going to sleep - * @cpu: CPU in question, must be the current CPU number * * This function is called during schedule() when a busy worker is * going to sleep. Worker on the same cpu can be woken up by @@ -842,7 +868,7 @@ void wq_worker_waking_up(struct task_struct *task, int cpu) * Return: * Worker task on @cpu to wake up, %NULL if none. */ -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) +struct task_struct *wq_worker_sleeping(struct task_struct *task) { struct worker *worker = kthread_data(task), *to_wakeup = NULL; struct worker_pool *pool; @@ -858,7 +884,7 @@ struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu) pool = worker->pool; /* this can only happen on the local cpu */ - if (WARN_ON_ONCE(cpu != raw_smp_processor_id() || pool->cpu != cpu)) + if (WARN_ON_ONCE(pool->cpu != raw_smp_processor_id())) return NULL; /* @@ -1083,6 +1109,8 @@ static void pwq_activate_delayed_work(struct work_struct *work) struct pool_workqueue *pwq = get_work_pwq(work); trace_workqueue_activate_work(work); + if (list_empty(&pwq->pool->worklist)) + pwq->pool->watchdog_ts = jiffies; move_linked_works(work, &pwq->pool->worklist, NULL); __clear_bit(WORK_STRUCT_DELAYED_BIT, work_data_bits(work)); pwq->nr_active++; @@ -1294,6 +1322,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. + */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1319,7 +1380,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1385,6 +1446,8 @@ retry: trace_workqueue_activate_work(work); pwq->nr_active++; worklist = &pwq->pool->worklist; + if (list_empty(worklist)) + pwq->pool->watchdog_ts = jiffies; } else { work_flags |= WORK_STRUCT_DELAYED; worklist = &pwq->delayed_works; @@ -1458,13 +1521,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** @@ -2157,6 +2220,8 @@ recheck: list_first_entry(&pool->worklist, struct work_struct, entry); + pool->watchdog_ts = jiffies; + if (likely(!(*work_data_bits(work) & WORK_STRUCT_LINKED))) { /* optimization path, not strictly necessary */ process_one_work(worker, work); @@ -2240,6 +2305,7 @@ repeat: struct pool_workqueue, mayday_node); struct worker_pool *pool = pwq->pool; struct work_struct *work, *n; + bool first = true; __set_current_state(TASK_RUNNING); list_del_init(&pwq->mayday_node); @@ -2256,9 +2322,14 @@ repeat: * process'em. */ WARN_ON_ONCE(!list_empty(scheduled)); - list_for_each_entry_safe(work, n, &pool->worklist, entry) - if (get_work_pwq(work) == pwq) + list_for_each_entry_safe(work, n, &pool->worklist, entry) { + if (get_work_pwq(work) == pwq) { + if (first) + pool->watchdog_ts = jiffies; move_linked_works(work, scheduled, &n); + } + first = false; + } if (!list_empty(scheduled)) { process_scheduled_works(rescuer); @@ -2316,6 +2387,38 @@ repeat: goto repeat; } +/** + * check_flush_dependency - check for flush dependency sanity + * @target_wq: workqueue being flushed + * @target_work: work item being flushed (NULL for workqueue flushes) + * + * %current is trying to flush the whole @target_wq or @target_work on it. + * If @target_wq doesn't have %WQ_MEM_RECLAIM, verify that %current is not + * reclaiming memory or running on a workqueue which doesn't have + * %WQ_MEM_RECLAIM as that can break forward-progress guarantee leading to + * a deadlock. + */ +static void check_flush_dependency(struct workqueue_struct *target_wq, + struct work_struct *target_work) +{ + work_func_t target_func = target_work ? target_work->func : NULL; + struct worker *worker; + + if (target_wq->flags & WQ_MEM_RECLAIM) + return; + + worker = current_wq_worker(); + + WARN_ONCE(current->flags & PF_MEMALLOC, + "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", + current->pid, current->comm, target_wq->name, target_func); + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), + "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", + worker->current_pwq->wq->name, worker->current_func, + target_wq->name, target_func); +} + struct wq_barrier { struct work_struct work; struct completion done; @@ -2525,6 +2628,8 @@ void flush_workqueue(struct workqueue_struct *wq) list_add_tail(&this_flusher.list, &wq->flusher_overflow); } + check_flush_dependency(wq, NULL); + mutex_unlock(&wq->mutex); wait_for_completion(&this_flusher.done); @@ -2697,6 +2802,8 @@ static bool start_flush_work(struct work_struct *work, struct wq_barrier *barr) pwq = worker->current_pwq; } + check_flush_dependency(pwq->wq, work); + insert_wq_barrier(pwq, barr, work, worker); spin_unlock_irq(&pool->lock); @@ -3069,6 +3176,7 @@ static int init_worker_pool(struct worker_pool *pool) pool->cpu = -1; pool->node = NUMA_NO_NODE; pool->flags |= POOL_DISASSOCIATED; + pool->watchdog_ts = jiffies; INIT_LIST_HEAD(&pool->worklist); INIT_LIST_HEAD(&pool->idle_list); hash_init(pool->busy_hash); @@ -3601,7 +3709,6 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, const struct workqueue_attrs *attrs) { struct apply_wqattrs_ctx *ctx; - int ret = -ENOMEM; /* only unbound workqueues can change attributes */ if (WARN_ON(!(wq->flags & WQ_UNBOUND))) @@ -3612,16 +3719,14 @@ static int apply_workqueue_attrs_locked(struct workqueue_struct *wq, return -EINVAL; ctx = apply_wqattrs_prepare(wq, attrs); + if (!ctx) + return -ENOMEM; /* the ctx has been prepared successfully, let's commit it */ - if (ctx) { - apply_wqattrs_commit(ctx); - ret = 0; - } - + apply_wqattrs_commit(ctx); apply_wqattrs_cleanup(ctx); - return ret; + return 0; } /** @@ -4308,7 +4413,9 @@ void show_workqueue_state(void) pr_info("pool %d:", pool->id); pr_cont_pool_info(pool); - pr_cont(" workers=%d", pool->nr_workers); + pr_cont(" hung=%us workers=%d", + jiffies_to_msecs(jiffies - pool->watchdog_ts) / 1000, + pool->nr_workers); if (pool->manager) pr_cont(" manager: %d", task_pid_nr(pool->manager->task)); @@ -4587,7 +4694,7 @@ static void work_for_cpu_fn(struct work_struct *work) } /** - * work_on_cpu - run a function in user context on a particular cpu + * work_on_cpu - run a function in thread context on a particular cpu * @cpu: the cpu to run on * @fn: the function to run * @arg: the function arg @@ -5113,8 +5220,8 @@ int workqueue_sysfs_register(struct workqueue_struct *wq) wq_dev->wq = wq; wq_dev->dev.bus = &wq_subsys; - wq_dev->dev.init_name = wq->name; wq_dev->dev.release = wq_device_release; + dev_set_name(&wq_dev->dev, "%s", wq->name); /* * unbound_attrs are created separately. Suppress uevent until @@ -5167,6 +5274,154 @@ static void workqueue_sysfs_unregister(struct workqueue_struct *wq) static void workqueue_sysfs_unregister(struct workqueue_struct *wq) { } #endif /* CONFIG_SYSFS */ +/* + * Workqueue watchdog. + * + * Stall may be caused by various bugs - missing WQ_MEM_RECLAIM, illegal + * flush dependency, a concurrency managed work item which stays RUNNING + * indefinitely. Workqueue stalls can be very difficult to debug as the + * usual warning mechanisms don't trigger and internal workqueue state is + * largely opaque. + * + * Workqueue watchdog monitors all worker pools periodically and dumps + * state if some pools failed to make forward progress for a while where + * forward progress is defined as the first item on ->worklist changing. + * + * This mechanism is controlled through the kernel parameter + * "workqueue.watchdog_thresh" which can be updated at runtime through the + * corresponding sysfs parameter file. + */ +#ifdef CONFIG_WQ_WATCHDOG + +static void wq_watchdog_timer_fn(unsigned long data); + +static unsigned long wq_watchdog_thresh = 30; +static struct timer_list wq_watchdog_timer = + TIMER_DEFERRED_INITIALIZER(wq_watchdog_timer_fn, 0, 0); + +static unsigned long wq_watchdog_touched = INITIAL_JIFFIES; +static DEFINE_PER_CPU(unsigned long, wq_watchdog_touched_cpu) = INITIAL_JIFFIES; + +static void wq_watchdog_reset_touched(void) +{ + int cpu; + + wq_watchdog_touched = jiffies; + for_each_possible_cpu(cpu) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; +} + +static void wq_watchdog_timer_fn(unsigned long data) +{ + unsigned long thresh = READ_ONCE(wq_watchdog_thresh) * HZ; + bool lockup_detected = false; + struct worker_pool *pool; + int pi; + + if (!thresh) + return; + + rcu_read_lock(); + + for_each_pool(pool, pi) { + unsigned long pool_ts, touched, ts; + + if (list_empty(&pool->worklist)) + continue; + + /* get the latest of pool and touched timestamps */ + pool_ts = READ_ONCE(pool->watchdog_ts); + touched = READ_ONCE(wq_watchdog_touched); + + if (time_after(pool_ts, touched)) + ts = pool_ts; + else + ts = touched; + + if (pool->cpu >= 0) { + unsigned long cpu_touched = + READ_ONCE(per_cpu(wq_watchdog_touched_cpu, + pool->cpu)); + if (time_after(cpu_touched, ts)) + ts = cpu_touched; + } + + /* did we stall? */ + if (time_after(jiffies, ts + thresh)) { + lockup_detected = true; + pr_emerg("BUG: workqueue lockup - pool"); + pr_cont_pool_info(pool); + pr_cont(" stuck for %us!\n", + jiffies_to_msecs(jiffies - pool_ts) / 1000); + } + } + + rcu_read_unlock(); + + if (lockup_detected) + show_workqueue_state(); + + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh); +} + +void wq_watchdog_touch(int cpu) +{ + if (cpu >= 0) + per_cpu(wq_watchdog_touched_cpu, cpu) = jiffies; + else + wq_watchdog_touched = jiffies; +} + +static void wq_watchdog_set_thresh(unsigned long thresh) +{ + wq_watchdog_thresh = 0; + del_timer_sync(&wq_watchdog_timer); + + if (thresh) { + wq_watchdog_thresh = thresh; + wq_watchdog_reset_touched(); + mod_timer(&wq_watchdog_timer, jiffies + thresh * HZ); + } +} + +static int wq_watchdog_param_set_thresh(const char *val, + const struct kernel_param *kp) +{ + unsigned long thresh; + int ret; + + ret = kstrtoul(val, 0, &thresh); + if (ret) + return ret; + + if (system_wq) + wq_watchdog_set_thresh(thresh); + else + wq_watchdog_thresh = thresh; + + return 0; +} + +static const struct kernel_param_ops wq_watchdog_thresh_ops = { + .set = wq_watchdog_param_set_thresh, + .get = param_get_ulong, +}; + +module_param_cb(watchdog_thresh, &wq_watchdog_thresh_ops, &wq_watchdog_thresh, + 0644); + +static void wq_watchdog_init(void) +{ + wq_watchdog_set_thresh(wq_watchdog_thresh); +} + +#else /* CONFIG_WQ_WATCHDOG */ + +static inline void wq_watchdog_init(void) { } + +#endif /* CONFIG_WQ_WATCHDOG */ + static void __init wq_numa_init(void) { cpumask_var_t *tbl; @@ -5290,6 +5545,9 @@ static int __init init_workqueues(void) !system_unbound_wq || !system_freezable_wq || !system_power_efficient_wq || !system_freezable_power_efficient_wq); + + wq_watchdog_init(); + return 0; } early_initcall(init_workqueues); diff --git a/kernel/workqueue_internal.h b/kernel/workqueue_internal.h index 45215870ac6c..8635417c587b 100644 --- a/kernel/workqueue_internal.h +++ b/kernel/workqueue_internal.h @@ -69,6 +69,6 @@ static inline struct worker *current_wq_worker(void) * sched/core.c and workqueue.c. */ void wq_worker_waking_up(struct task_struct *task, int cpu); -struct task_struct *wq_worker_sleeping(struct task_struct *task, int cpu); +struct task_struct *wq_worker_sleeping(struct task_struct *task); #endif /* _KERNEL_WORKQUEUE_INTERNAL_H */ |