diff options
Diffstat (limited to 'kernel')
31 files changed, 1255 insertions, 954 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c index b0799bced518..89ebbc4d1164 100644 --- a/kernel/bpf/arraymap.c +++ b/kernel/bpf/arraymap.c @@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) { struct perf_event *event; const struct perf_event_attr *attr; + struct file *file; - event = perf_event_get(fd); - if (IS_ERR(event)) - return event; + file = perf_event_get(fd); + if (IS_ERR(file)) + return file; + + event = file->private_data; attr = perf_event_attrs(event); if (IS_ERR(attr)) @@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd) goto err; if (attr->type == PERF_TYPE_RAW) - return event; + return file; if (attr->type == PERF_TYPE_HARDWARE) - return event; + return file; if (attr->type == PERF_TYPE_SOFTWARE && attr->config == PERF_COUNT_SW_BPF_OUTPUT) - return event; + return file; err: - perf_event_release_kernel(event); + fput(file); return ERR_PTR(-EINVAL); } static void perf_event_fd_array_put_ptr(void *ptr) { - struct perf_event *event = ptr; - - perf_event_release_kernel(event); + fput((struct file *)ptr); } static const struct bpf_map_ops perf_event_array_ops = { diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index d1d3e8f57de9..2e7f7ab739e4 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta) /* adjust offset of jmps if necessary */ if (i < pos && i + insn->off + 1 > pos) insn->off += delta; - else if (i > pos && i + insn->off + 1 < pos) + else if (i > pos + delta && i + insn->off + 1 <= pos + delta) insn->off -= delta; } } diff --git a/kernel/cgroup.c b/kernel/cgroup.c index c03a640ef6da..d27904c193da 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -58,6 +58,7 @@ #include <linux/kthread.h> #include <linux/delay.h> #include <linux/atomic.h> +#include <linux/cpuset.h> #include <net/sock.h> /* @@ -2739,6 +2740,7 @@ out_unlock_rcu: out_unlock_threadgroup: percpu_up_write(&cgroup_threadgroup_rwsem); cgroup_kn_unlock(of->kn); + cpuset_post_attach_flush(); return ret ?: nbytes; } @@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work) if (ss) { /* css free path */ + struct cgroup_subsys_state *parent = css->parent; int id = css->id; - if (css->parent) - css_put(css->parent); - ss->css_free(css); cgroup_idr_remove(&ss->css_idr, id); cgroup_put(cgrp); + + if (parent) + css_put(parent); } else { /* cgroup free path */ atomic_dec(&cgrp->root->nr_cgrps); @@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css, INIT_LIST_HEAD(&css->sibling); INIT_LIST_HEAD(&css->children); css->serial_nr = css_serial_nr_next++; + atomic_set(&css->online_cnt, 0); if (cgroup_parent(cgrp)) { css->parent = cgroup_css(cgroup_parent(cgrp), ss); @@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css) if (!ret) { css->flags |= CSS_ONLINE; rcu_assign_pointer(css->cgroup->subsys[ss->id], css); + + atomic_inc(&css->online_cnt); + if (css->parent) + atomic_inc(&css->parent->online_cnt); } return ret; } @@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work) container_of(work, struct cgroup_subsys_state, destroy_work); mutex_lock(&cgroup_mutex); - offline_css(css); - mutex_unlock(&cgroup_mutex); - css_put(css); + do { + offline_css(css); + css_put(css); + /* @css can't go away while we're holding cgroup_mutex */ + css = css->parent; + } while (css && atomic_dec_and_test(&css->online_cnt)); + + mutex_unlock(&cgroup_mutex); } /* css kill confirmation processing requires process context, bounce */ @@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref) struct cgroup_subsys_state *css = container_of(ref, struct cgroup_subsys_state, refcnt); - INIT_WORK(&css->destroy_work, css_killed_work_fn); - queue_work(cgroup_destroy_wq, &css->destroy_work); + if (atomic_dec_and_test(&css->online_cnt)) { + INIT_WORK(&css->destroy_work, css_killed_work_fn); + queue_work(cgroup_destroy_wq, &css->destroy_work); + } } /** diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 3e945fcd8179..41989ab4db57 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -287,6 +287,8 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(cpuset_mutex); static DEFINE_SPINLOCK(callback_lock); +static struct workqueue_struct *cpuset_migrate_mm_wq; + /* * CPU / memory hotplug is handled asynchronously. */ @@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, } /* - * cpuset_migrate_mm - * - * Migrate memory region from one set of nodes to another. - * - * Temporarilly set tasks mems_allowed to target nodes of migration, - * so that the migration code can allocate pages on these nodes. - * - * While the mm_struct we are migrating is typically from some - * other task, the task_struct mems_allowed that we are hacking - * is for our current task, which must allocate new pages for that - * migrating memory region. + * Migrate memory region from one set of nodes to another. This is + * performed asynchronously as it can be called from process migration path + * holding locks involved in process management. All mm migrations are + * performed in the queued order and can be waited for by flushing + * cpuset_migrate_mm_wq. */ +struct cpuset_migrate_mm_work { + struct work_struct work; + struct mm_struct *mm; + nodemask_t from; + nodemask_t to; +}; + +static void cpuset_migrate_mm_workfn(struct work_struct *work) +{ + struct cpuset_migrate_mm_work *mwork = + container_of(work, struct cpuset_migrate_mm_work, work); + + /* on a wq worker, no need to worry about %current's mems_allowed */ + do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL); + mmput(mwork->mm); + kfree(mwork); +} + static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from, const nodemask_t *to) { - struct task_struct *tsk = current; - - tsk->mems_allowed = *to; + struct cpuset_migrate_mm_work *mwork; - do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL); + mwork = kzalloc(sizeof(*mwork), GFP_KERNEL); + if (mwork) { + mwork->mm = mm; + mwork->from = *from; + mwork->to = *to; + INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn); + queue_work(cpuset_migrate_mm_wq, &mwork->work); + } else { + mmput(mm); + } +} - rcu_read_lock(); - guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed); - rcu_read_unlock(); +void cpuset_post_attach_flush(void) +{ + flush_workqueue(cpuset_migrate_mm_wq); } /* @@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs) mpol_rebind_mm(mm, &cs->mems_allowed); if (migrate) cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems); - mmput(mm); + else + mmput(mm); } css_task_iter_end(&it); @@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset) * @old_mems_allowed is the right nodesets that we * migrate mm from. */ - if (is_memory_migrate(cs)) { + if (is_memory_migrate(cs)) cpuset_migrate_mm(mm, &oldcs->old_mems_allowed, &cpuset_attach_nodemask_to); - } - mmput(mm); + else + mmput(mm); } } @@ -1714,6 +1737,7 @@ out_unlock: mutex_unlock(&cpuset_mutex); kernfs_unbreak_active_protection(of->kn); css_put(&cs->css); + flush_workqueue(cpuset_migrate_mm_wq); return retval ?: nbytes; } @@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void) top_cpuset.effective_mems = node_states[N_MEMORY]; register_hotmemory_notifier(&cpuset_track_online_nodes_nb); + + cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0); + BUG_ON(!cpuset_migrate_mm_wq); } /** diff --git a/kernel/events/core.c b/kernel/events/core.c index 06ae52e99ac2..614614821f00 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -49,8 +49,6 @@ #include <asm/irq_regs.h> -static struct workqueue_struct *perf_wq; - typedef int (*remote_function_f)(void *); struct remote_function_call { @@ -66,8 +64,17 @@ static void remote_function(void *data) struct task_struct *p = tfc->p; if (p) { - tfc->ret = -EAGAIN; - if (task_cpu(p) != smp_processor_id() || !task_curr(p)) + /* -EAGAIN */ + if (task_cpu(p) != smp_processor_id()) + return; + + /* + * Now that we're on right CPU with IRQs disabled, we can test + * if we hit the right task without races. + */ + + tfc->ret = -ESRCH; /* No such (running) process */ + if (p != current) return; } @@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info) .p = p, .func = func, .info = info, - .ret = -ESRCH, /* No such (running) process */ + .ret = -EAGAIN, }; + int ret; - if (task_curr(p)) - smp_call_function_single(task_cpu(p), remote_function, &data, 1); + do { + ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1); + if (!ret) + ret = data.ret; + } while (ret == -EAGAIN); - return data.ret; + return ret; } /** @@ -126,44 +137,170 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info) return data.ret; } -static void event_function_call(struct perf_event *event, - int (*active)(void *), - void (*inactive)(void *), - void *data) +static inline struct perf_cpu_context * +__get_cpu_context(struct perf_event_context *ctx) +{ + return this_cpu_ptr(ctx->pmu->pmu_cpu_context); +} + +static void perf_ctx_lock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx) + raw_spin_lock(&ctx->lock); +} + +static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (ctx) + raw_spin_unlock(&ctx->lock); + raw_spin_unlock(&cpuctx->ctx.lock); +} + +#define TASK_TOMBSTONE ((void *)-1L) + +static bool is_kernel_event(struct perf_event *event) { + return READ_ONCE(event->owner) == TASK_TOMBSTONE; +} + +/* + * On task ctx scheduling... + * + * When !ctx->nr_events a task context will not be scheduled. This means + * we can disable the scheduler hooks (for performance) without leaving + * pending task ctx state. + * + * This however results in two special cases: + * + * - removing the last event from a task ctx; this is relatively straight + * forward and is done in __perf_remove_from_context. + * + * - adding the first event to a task ctx; this is tricky because we cannot + * rely on ctx->is_active and therefore cannot use event_function_call(). + * See perf_install_in_context(). + * + * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set. + */ + +typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *, + struct perf_event_context *, void *); + +struct event_function_struct { + struct perf_event *event; + event_f func; + void *data; +}; + +static int event_function(void *info) +{ + struct event_function_struct *efs = info; + struct perf_event *event = efs->event; struct perf_event_context *ctx = event->ctx; - struct task_struct *task = ctx->task; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event_context *task_ctx = cpuctx->task_ctx; + int ret = 0; + + WARN_ON_ONCE(!irqs_disabled()); + + perf_ctx_lock(cpuctx, task_ctx); + /* + * Since we do the IPI call without holding ctx->lock things can have + * changed, double check we hit the task we set out to hit. + */ + if (ctx->task) { + if (ctx->task != current) { + ret = -ESRCH; + goto unlock; + } + + /* + * We only use event_function_call() on established contexts, + * and event_function() is only ever called when active (or + * rather, we'll have bailed in task_function_call() or the + * above ctx->task != current test), therefore we must have + * ctx->is_active here. + */ + WARN_ON_ONCE(!ctx->is_active); + /* + * And since we have ctx->is_active, cpuctx->task_ctx must + * match. + */ + WARN_ON_ONCE(task_ctx != ctx); + } else { + WARN_ON_ONCE(&cpuctx->ctx != ctx); + } + + efs->func(event, cpuctx, ctx, efs->data); +unlock: + perf_ctx_unlock(cpuctx, task_ctx); + + return ret; +} + +static void event_function_local(struct perf_event *event, event_f func, void *data) +{ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + int ret = event_function(&efs); + WARN_ON_ONCE(ret); +} + +static void event_function_call(struct perf_event *event, event_f func, void *data) +{ + struct perf_event_context *ctx = event->ctx; + struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */ + struct event_function_struct efs = { + .event = event, + .func = func, + .data = data, + }; + + if (!event->parent) { + /* + * If this is a !child event, we must hold ctx::mutex to + * stabilize the the event->ctx relation. See + * perf_event_ctx_lock(). + */ + lockdep_assert_held(&ctx->mutex); + } if (!task) { - cpu_function_call(event->cpu, active, data); + cpu_function_call(event->cpu, event_function, &efs); return; } + if (task == TASK_TOMBSTONE) + return; + again: - if (!task_function_call(task, active, data)) + if (!task_function_call(task, event_function, &efs)) return; raw_spin_lock_irq(&ctx->lock); + /* + * Reload the task pointer, it might have been changed by + * a concurrent perf_event_context_sched_out(). + */ + task = ctx->task; + if (task == TASK_TOMBSTONE) { + raw_spin_unlock_irq(&ctx->lock); + return; + } if (ctx->is_active) { - /* - * Reload the task pointer, it might have been changed by - * a concurrent perf_event_context_sched_out(). - */ - task = ctx->task; raw_spin_unlock_irq(&ctx->lock); goto again; } - inactive(data); + func(event, NULL, ctx, data); raw_spin_unlock_irq(&ctx->lock); } -#define EVENT_OWNER_KERNEL ((void *) -1) - -static bool is_kernel_event(struct perf_event *event) -{ - return event->owner == EVENT_OWNER_KERNEL; -} - #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\ PERF_FLAG_FD_OUTPUT |\ PERF_FLAG_PID_CGROUP |\ @@ -179,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event) enum event_type_t { EVENT_FLEXIBLE = 0x1, EVENT_PINNED = 0x2, + EVENT_TIME = 0x4, EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED, }; @@ -186,7 +324,13 @@ enum event_type_t { * perf_sched_events : >0 events exist * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu */ -struct static_key_deferred perf_sched_events __read_mostly; + +static void perf_sched_delayed(struct work_struct *work); +DEFINE_STATIC_KEY_FALSE(perf_sched_events); +static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed); +static DEFINE_MUTEX(perf_sched_mutex); +static atomic_t perf_sched_count; + static DEFINE_PER_CPU(atomic_t, perf_cgroup_events); static DEFINE_PER_CPU(int, perf_sched_cb_usages); @@ -368,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event) return event->clock(); } -static inline struct perf_cpu_context * -__get_cpu_context(struct perf_event_context *ctx) -{ - return this_cpu_ptr(ctx->pmu->pmu_cpu_context); -} - -static void perf_ctx_lock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - raw_spin_lock(&cpuctx->ctx.lock); - if (ctx) - raw_spin_lock(&ctx->lock); -} - -static void perf_ctx_unlock(struct perf_cpu_context *cpuctx, - struct perf_event_context *ctx) -{ - if (ctx) - raw_spin_unlock(&ctx->lock); - raw_spin_unlock(&cpuctx->ctx.lock); -} - #ifdef CONFIG_CGROUP_PERF static inline bool @@ -579,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* - * next is NULL when called from perf_event_enable_on_exec() - * that will systematically cause a cgroup_switch() - */ - if (next) - cgrp2 = perf_cgroup_from_task(next, NULL); + cgrp2 = perf_cgroup_from_task(next, NULL); /* * only schedule out current cgroup events if we know @@ -611,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev, * we are holding the rcu lock */ cgrp1 = perf_cgroup_from_task(task, NULL); - - /* prev can never be NULL */ cgrp2 = perf_cgroup_from_task(prev, NULL); /* @@ -917,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx) if (atomic_dec_and_test(&ctx->refcount)) { if (ctx->parent_ctx) put_ctx(ctx->parent_ctx); - if (ctx->task) + if (ctx->task && ctx->task != TASK_TOMBSTONE) put_task_struct(ctx->task); call_rcu(&ctx->rcu_head, free_ctx); } @@ -934,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx) * perf_event_context::mutex nests and those are: * * - perf_event_exit_task_context() [ child , 0 ] - * __perf_event_exit_task() - * sync_child_event() - * put_event() [ parent, 1 ] + * perf_event_exit_event() + * put_event() [ parent, 1 ] * * - perf_event_init_context() [ parent, 0 ] * inherit_task_group() @@ -979,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx) * Lock order: * task_struct::perf_event_mutex * perf_event_context::mutex - * perf_event_context::lock * perf_event::child_mutex; + * perf_event_context::lock * perf_event::mmap_mutex * mmap_sem */ @@ -1078,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event) /* * Get the perf_event_context for a task and lock it. + * * This has to cope with with the fact that until it is locked, * the context could get moved to another task. */ @@ -1118,9 +1232,12 @@ retry: goto retry; } - if (!atomic_inc_not_zero(&ctx->refcount)) { + if (ctx->task == TASK_TOMBSTONE || + !atomic_inc_not_zero(&ctx->refcount)) { raw_spin_unlock(&ctx->lock); ctx = NULL; + } else { + WARN_ON_ONCE(ctx->task != task); } } rcu_read_unlock(); @@ -1180,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event) /* * Update the total_time_enabled and total_time_running fields for a event. - * The caller of this function needs to hold the ctx->lock. */ static void update_event_times(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; u64 run_end; + lockdep_assert_held(&ctx->lock); + if (event->state < PERF_EVENT_STATE_INACTIVE || event->group_leader->state < PERF_EVENT_STATE_INACTIVE) return; + /* * in cgroup mode, time_enabled represents * the time the event was enabled AND active @@ -1246,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx) static void list_add_event(struct perf_event *event, struct perf_event_context *ctx) { + lockdep_assert_held(&ctx->lock); + WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT); event->attach_state |= PERF_ATTACH_CONTEXT; @@ -1448,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx) if (is_cgroup_event(event)) { ctx->nr_cgroups--; + /* + * Because cgroup events are always per-cpu events, this will + * always be called from the right CPU. + */ cpuctx = __get_cpu_context(ctx); /* - * if there are no more cgroup events - * then cler cgrp to avoid stale pointer - * in update_cgrp_time_from_cpuctx() + * If there are no more cgroup events then clear cgrp to avoid + * stale pointer in update_cgrp_time_from_cpuctx(). */ if (!ctx->nr_cgroups) cpuctx->cgrp = NULL; @@ -1530,45 +1654,11 @@ out: perf_event__header_size(tmp); } -/* - * User event without the task. - */ static bool is_orphaned_event(struct perf_event *event) { - return event && !is_kernel_event(event) && !event->owner; + return event->state == PERF_EVENT_STATE_DEAD; } -/* - * Event has a parent but parent's task finished and it's - * alive only because of children holding refference. - */ -static bool is_orphaned_child(struct perf_event *event) -{ - return is_orphaned_event(event->parent); -} - -static void orphans_remove_work(struct work_struct *work); - -static void schedule_orphans_remove(struct perf_event_context *ctx) -{ - if (!ctx->task || ctx->orphans_remove_sched || !perf_wq) - return; - - if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) { - get_ctx(ctx); - ctx->orphans_remove_sched = true; - } -} - -static int __init perf_workqueue_init(void) -{ - perf_wq = create_singlethread_workqueue("perf"); - WARN(!perf_wq, "failed to create perf workqueue\n"); - return perf_wq ? 0 : -1; -} - -core_initcall(perf_workqueue_init); - static inline int pmu_filter_match(struct perf_event *event) { struct pmu *pmu = event->pmu; @@ -1611,14 +1701,14 @@ event_sched_out(struct perf_event *event, perf_pmu_disable(event->pmu); + event->tstamp_stopped = tstamp; + event->pmu->del(event, 0); + event->oncpu = -1; event->state = PERF_EVENT_STATE_INACTIVE; if (event->pending_disable) { event->pending_disable = 0; event->state = PERF_EVENT_STATE_OFF; } - event->tstamp_stopped = tstamp; - event->pmu->del(event, 0); - event->oncpu = -1; if (!is_software_event(event)) cpuctx->active_oncpu--; @@ -1629,9 +1719,6 @@ event_sched_out(struct perf_event *event, if (event->attr.exclusive || !cpuctx->active_oncpu) cpuctx->exclusive = 0; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - perf_pmu_enable(event->pmu); } @@ -1655,21 +1742,7 @@ group_sched_out(struct perf_event *group_event, cpuctx->exclusive = 0; } -struct remove_event { - struct perf_event *event; - bool detach_group; -}; - -static void ___perf_remove_from_context(void *info) -{ - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - - if (re->detach_group) - perf_group_detach(event); - list_del_event(event, ctx); -} +#define DETACH_GROUP 0x01UL /* * Cross CPU call to remove a performance event @@ -1677,33 +1750,31 @@ static void ___perf_remove_from_context(void *info) * We disable the event on the hardware level first. After that we * remove it from the context list. */ -static int __perf_remove_from_context(void *info) +static void +__perf_remove_from_context(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct remove_event *re = info; - struct perf_event *event = re->event; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + unsigned long flags = (unsigned long)info; - raw_spin_lock(&ctx->lock); event_sched_out(event, cpuctx, ctx); - if (re->detach_group) + if (flags & DETACH_GROUP) perf_group_detach(event); list_del_event(event, ctx); - if (!ctx->nr_events && cpuctx->task_ctx == ctx) { + + if (!ctx->nr_events && ctx->is_active) { ctx->is_active = 0; - cpuctx->task_ctx = NULL; + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + cpuctx->task_ctx = NULL; + } } - raw_spin_unlock(&ctx->lock); - - return 0; } /* * Remove the event from a task's (or a CPU's) list of events. * - * CPU events are removed with a smp call. For task events we only - * call when the task is on a CPU. - * * If event->ctx is a cloned context, callers must make sure that * every task struct that event->ctx->task could possibly point to * remains valid. This is OK when called from perf_release since @@ -1711,73 +1782,32 @@ static int __perf_remove_from_context(void *info) * When called from perf_event_exit_task, it's OK because the * context has been detached from its task. */ -static void perf_remove_from_context(struct perf_event *event, bool detach_group) +static void perf_remove_from_context(struct perf_event *event, unsigned long flags) { - struct perf_event_context *ctx = event->ctx; - struct remove_event re = { - .event = event, - .detach_group = detach_group, - }; - - lockdep_assert_held(&ctx->mutex); + lockdep_assert_held(&event->ctx->mutex); - event_function_call(event, __perf_remove_from_context, - ___perf_remove_from_context, &re); + event_function_call(event, __perf_remove_from_context, (void *)flags); } /* * Cross CPU call to disable a performance event */ -int __perf_event_disable(void *info) -{ - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - /* - * If this is a per-task event, need to check whether this - * event's task is the current task on this cpu. - * - * Can trigger due to concurrent perf_event_context_sched_out() - * flipping contexts around. - */ - if (ctx->task && cpuctx->task_ctx != ctx) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - - /* - * If the event is on, turn it off. - * If it is in error state, leave it in error state. - */ - if (event->state >= PERF_EVENT_STATE_INACTIVE) { - update_context_time(ctx); - update_cgrp_time_from_event(event); - update_group_times(event); - if (event == event->group_leader) - group_sched_out(event, cpuctx, ctx); - else - event_sched_out(event, cpuctx, ctx); - event->state = PERF_EVENT_STATE_OFF; - } - - raw_spin_unlock(&ctx->lock); - - return 0; -} - -void ___perf_event_disable(void *info) +static void __perf_event_disable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; + if (event->state < PERF_EVENT_STATE_INACTIVE) + return; - /* - * Since we have the lock this context can't be scheduled - * in, so we can change the state safely. - */ - if (event->state == PERF_EVENT_STATE_INACTIVE) { - update_group_times(event); - event->state = PERF_EVENT_STATE_OFF; - } + update_context_time(ctx); + update_cgrp_time_from_event(event); + update_group_times(event); + if (event == event->group_leader) + group_sched_out(event, cpuctx, ctx); + else + event_sched_out(event, cpuctx, ctx); + event->state = PERF_EVENT_STATE_OFF; } /* @@ -1788,7 +1818,8 @@ void ___perf_event_disable(void *info) * remains valid. This condition is satisifed when called through * perf_event_for_each_child or perf_event_for_each because they * hold the top-level event's child_mutex, so any descendant that - * goes to exit will block in sync_child_event. + * goes to exit will block in perf_event_exit_event(). + * * When called from perf_pending_event it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. @@ -1804,8 +1835,12 @@ static void _perf_event_disable(struct perf_event *event) } raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_disable, - ___perf_event_disable, event); + event_function_call(event, __perf_event_disable, NULL); +} + +void perf_event_disable_local(struct perf_event *event) +{ + event_function_local(event, __perf_event_disable, NULL); } /* @@ -1918,9 +1953,6 @@ event_sched_in(struct perf_event *event, if (event->attr.exclusive) cpuctx->exclusive = 1; - if (is_orphaned_child(event)) - schedule_orphans_remove(ctx); - out: perf_pmu_enable(event->pmu); @@ -2039,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event, event->tstamp_stopped = tstamp; } -static void task_ctx_sched_out(struct perf_event_context *ctx); +static void ctx_sched_out(struct perf_event_context *ctx, + struct perf_cpu_context *cpuctx, + enum event_type_t event_type); static void ctx_sched_in(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type, struct task_struct *task); +static void task_ctx_sched_out(struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx) +{ + if (!cpuctx->task_ctx) + return; + + if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) + return; + + ctx_sched_out(ctx, cpuctx, EVENT_ALL); +} + static void perf_event_sched_in(struct perf_cpu_context *cpuctx, struct perf_event_context *ctx, struct task_struct *task) @@ -2058,22 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx, ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task); } -static void ___perf_install_in_context(void *info) +static void ctx_resched(struct perf_cpu_context *cpuctx, + struct perf_event_context *task_ctx) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; - - /* - * Since the task isn't running, its safe to add the event, us holding - * the ctx->lock ensures the task won't get scheduled in. - */ - add_event_to_ctx(event, ctx); + perf_pmu_disable(cpuctx->ctx.pmu); + if (task_ctx) + task_ctx_sched_out(cpuctx, task_ctx); + cpu_ctx_sched_out(cpuctx, EVENT_ALL); + perf_event_sched_in(cpuctx, task_ctx, current); + perf_pmu_enable(cpuctx->ctx.pmu); } /* * Cross CPU call to install and enable a performance event * - * Must be called with ctx->mutex held + * Very similar to remote_function() + event_function() but cannot assume that + * things like ctx->is_active and cpuctx->task_ctx are set. */ static int __perf_install_in_context(void *info) { @@ -2081,79 +2127,106 @@ static int __perf_install_in_context(void *info) struct perf_event_context *ctx = event->ctx; struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); struct perf_event_context *task_ctx = cpuctx->task_ctx; - struct task_struct *task = current; - - perf_ctx_lock(cpuctx, task_ctx); - perf_pmu_disable(cpuctx->ctx.pmu); - - /* - * If there was an active task_ctx schedule it out. - */ - if (task_ctx) - task_ctx_sched_out(task_ctx); + bool activate = true; + int ret = 0; - /* - * If the context we're installing events in is not the - * active task_ctx, flip them. - */ - if (ctx->task && task_ctx != ctx) { - if (task_ctx) - raw_spin_unlock(&task_ctx->lock); + raw_spin_lock(&cpuctx->ctx.lock); + if (ctx->task) { raw_spin_lock(&ctx->lock); task_ctx = ctx; - } - - if (task_ctx) { - cpuctx->task_ctx = task_ctx; - task = task_ctx->task; - } - cpu_ctx_sched_out(cpuctx, EVENT_ALL); + /* If we're on the wrong CPU, try again */ + if (task_cpu(ctx->task) != smp_processor_id()) { + ret = -ESRCH; + goto unlock; + } - update_context_time(ctx); - /* - * update cgrp time only if current cgrp - * matches event->cgrp. Must be done before - * calling add_event_to_ctx() - */ - update_cgrp_time_from_event(event); + /* + * If we're on the right CPU, see if the task we target is + * current, if not we don't have to activate the ctx, a future + * context switch will do that for us. + */ + if (ctx->task != current) + activate = false; + else + WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx); - add_event_to_ctx(event, ctx); + } else if (task_ctx) { + raw_spin_lock(&task_ctx->lock); + } - /* - * Schedule everything back in - */ - perf_event_sched_in(cpuctx, task_ctx, task); + if (activate) { + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + add_event_to_ctx(event, ctx); + ctx_resched(cpuctx, task_ctx); + } else { + add_event_to_ctx(event, ctx); + } - perf_pmu_enable(cpuctx->ctx.pmu); +unlock: perf_ctx_unlock(cpuctx, task_ctx); - return 0; + return ret; } /* - * Attach a performance event to a context - * - * First we add the event to the list with the hardware enable bit - * in event->hw_config cleared. + * Attach a performance event to a context. * - * If the event is attached to a task which is on a CPU we use a smp - * call to enable it in the task context. The task might have been - * scheduled away, but we check this in the smp call again. + * Very similar to event_function_call, see comment there. */ static void perf_install_in_context(struct perf_event_context *ctx, struct perf_event *event, int cpu) { + struct task_struct *task = READ_ONCE(ctx->task); + lockdep_assert_held(&ctx->mutex); event->ctx = ctx; if (event->cpu != -1) event->cpu = cpu; - event_function_call(event, __perf_install_in_context, - ___perf_install_in_context, event); + if (!task) { + cpu_function_call(cpu, __perf_install_in_context, event); + return; + } + + /* + * Should not happen, we validate the ctx is still alive before calling. + */ + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) + return; + + /* + * Installing events is tricky because we cannot rely on ctx->is_active + * to be set in case this is the nr_events 0 -> 1 transition. + */ +again: + /* + * Cannot use task_function_call() because we need to run on the task's + * CPU regardless of whether its current or not. + */ + if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event)) + return; + + raw_spin_lock_irq(&ctx->lock); + task = ctx->task; + if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) { + /* + * Cannot happen because we already checked above (which also + * cannot happen), and we hold ctx->mutex, which serializes us + * against perf_event_exit_task_context(). + */ + raw_spin_unlock_irq(&ctx->lock); + return; + } + raw_spin_unlock_irq(&ctx->lock); + /* + * Since !ctx->is_active doesn't mean anything, we must IPI + * unconditionally. + */ + goto again; } /* @@ -2180,85 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event) /* * Cross CPU call to enable a performance event */ -static int __perf_event_enable(void *info) +static void __perf_event_enable(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct perf_event *event = info; - struct perf_event_context *ctx = event->ctx; struct perf_event *leader = event->group_leader; - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - int err; + struct perf_event_context *task_ctx; - /* - * There's a time window between 'ctx->is_active' check - * in perf_event_enable function and this place having: - * - IRQs on - * - ctx->lock unlocked - * - * where the task could be killed and 'ctx' deactivated - * by perf_event_exit_task. - */ - if (!ctx->is_active) - return -EINVAL; - - raw_spin_lock(&ctx->lock); - update_context_time(ctx); + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state <= PERF_EVENT_STATE_ERROR) + return; - if (event->state >= PERF_EVENT_STATE_INACTIVE) - goto unlock; - - /* - * set current task's cgroup time reference point - */ - perf_cgroup_set_timestamp(current, ctx); + if (ctx->is_active) + ctx_sched_out(ctx, cpuctx, EVENT_TIME); __perf_event_mark_enabled(event); + if (!ctx->is_active) + return; + if (!event_filter_match(event)) { if (is_cgroup_event(event)) perf_cgroup_defer_enabled(event); - goto unlock; + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } /* * If the event is in a group and isn't the group leader, * then don't put it on unless the group is on. */ - if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) - goto unlock; - - if (!group_can_go_on(event, cpuctx, 1)) { - err = -EEXIST; - } else { - if (event == leader) - err = group_sched_in(event, cpuctx, ctx); - else - err = event_sched_in(event, cpuctx, ctx); - } - - if (err) { - /* - * If this event can't go on and it's part of a - * group, then the whole group has to come off. - */ - if (leader != event) { - group_sched_out(leader, cpuctx, ctx); - perf_mux_hrtimer_restart(cpuctx); - } - if (leader->attr.pinned) { - update_group_times(leader); - leader->state = PERF_EVENT_STATE_ERROR; - } + if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) { + ctx_sched_in(ctx, cpuctx, EVENT_TIME, current); + return; } -unlock: - raw_spin_unlock(&ctx->lock); - - return 0; -} + task_ctx = cpuctx->task_ctx; + if (ctx->task) + WARN_ON_ONCE(task_ctx != ctx); -void ___perf_event_enable(void *info) -{ - __perf_event_mark_enabled((struct perf_event *)info); + ctx_resched(cpuctx, task_ctx); } /* @@ -2275,7 +2310,8 @@ static void _perf_event_enable(struct perf_event *event) struct perf_event_context *ctx = event->ctx; raw_spin_lock_irq(&ctx->lock); - if (event->state >= PERF_EVENT_STATE_INACTIVE) { + if (event->state >= PERF_EVENT_STATE_INACTIVE || + event->state < PERF_EVENT_STATE_ERROR) { raw_spin_unlock_irq(&ctx->lock); return; } @@ -2291,8 +2327,7 @@ static void _perf_event_enable(struct perf_event *event) event->state = PERF_EVENT_STATE_OFF; raw_spin_unlock_irq(&ctx->lock); - event_function_call(event, __perf_event_enable, - ___perf_event_enable, event); + event_function_call(event, __perf_event_enable, NULL); } /* @@ -2342,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx, struct perf_cpu_context *cpuctx, enum event_type_t event_type) { - struct perf_event *event; int is_active = ctx->is_active; + struct perf_event *event; - ctx->is_active &= ~event_type; - if (likely(!ctx->nr_events)) + lockdep_assert_held(&ctx->lock); + + if (likely(!ctx->nr_events)) { + /* + * See __perf_remove_from_context(). + */ + WARN_ON_ONCE(ctx->is_active); + if (ctx->task) + WARN_ON_ONCE(cpuctx->task_ctx); return; + } - update_context_time(ctx); - update_cgrp_time_from_cpuctx(cpuctx); - if (!ctx->nr_active) + ctx->is_active &= ~event_type; + if (!(ctx->is_active & EVENT_ALL)) + ctx->is_active = 0; + + if (ctx->task) { + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + if (!ctx->is_active) + cpuctx->task_ctx = NULL; + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* update (and stop) ctx time */ + update_context_time(ctx); + update_cgrp_time_from_cpuctx(cpuctx); + } + + if (!ctx->nr_active || !(is_active & EVENT_ALL)) return; perf_pmu_disable(ctx->pmu); - if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) { + if (is_active & EVENT_PINNED) { list_for_each_entry(event, &ctx->pinned_groups, group_entry) group_sched_out(event, cpuctx, ctx); } - if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) { + if (is_active & EVENT_FLEXIBLE) { list_for_each_entry(event, &ctx->flexible_groups, group_entry) group_sched_out(event, cpuctx, ctx); } @@ -2518,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn, raw_spin_lock(&ctx->lock); raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING); if (context_equiv(ctx, next_ctx)) { - /* - * XXX do we need a memory barrier of sorts - * wrt to rcu_dereference() of perf_event_ctxp - */ - task->perf_event_ctxp[ctxn] = next_ctx; - next->perf_event_ctxp[ctxn] = ctx; - ctx->task = next; - next_ctx->task = task; + WRITE_ONCE(ctx->task, next); + WRITE_ONCE(next_ctx->task, task); swap(ctx->task_ctx_data, next_ctx->task_ctx_data); + /* + * RCU_INIT_POINTER here is safe because we've not + * modified the ctx and the above modification of + * ctx->task and ctx->task_ctx_data are immaterial + * since those values are always verified under + * ctx->lock which we're now holding. + */ + RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx); + RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx); + do_switch = 0; perf_event_sync_stat(ctx, next_ctx); @@ -2541,8 +2604,7 @@ unlock: if (do_switch) { raw_spin_lock(&ctx->lock); - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; + task_ctx_sched_out(cpuctx, ctx); raw_spin_unlock(&ctx->lock); } } @@ -2637,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task, perf_cgroup_sched_out(task, next); } -static void task_ctx_sched_out(struct perf_event_context *ctx) -{ - struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); - - if (!cpuctx->task_ctx) - return; - - if (WARN_ON_ONCE(ctx != cpuctx->task_ctx)) - return; - - ctx_sched_out(ctx, cpuctx, EVENT_ALL); - cpuctx->task_ctx = NULL; -} - /* * Called with IRQs disabled */ @@ -2725,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx, enum event_type_t event_type, struct task_struct *task) { - u64 now; int is_active = ctx->is_active; + u64 now; + + lockdep_assert_held(&ctx->lock); - ctx->is_active |= event_type; if (likely(!ctx->nr_events)) return; - now = perf_clock(); - ctx->timestamp = now; - perf_cgroup_set_timestamp(task, ctx); + ctx->is_active |= (event_type | EVENT_TIME); + if (ctx->task) { + if (!is_active) + cpuctx->task_ctx = ctx; + else + WARN_ON_ONCE(cpuctx->task_ctx != ctx); + } + + is_active ^= ctx->is_active; /* changed bits */ + + if (is_active & EVENT_TIME) { + /* start ctx time */ + now = perf_clock(); + ctx->timestamp = now; + perf_cgroup_set_timestamp(task, ctx); + } + /* * First go through the list and put on any pinned groups * in order to give them the best chance of going on. */ - if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) + if (is_active & EVENT_PINNED) ctx_pinned_sched_in(ctx, cpuctx); /* Then walk through the lower prio flexible groups */ - if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) + if (is_active & EVENT_FLEXIBLE) ctx_flexible_sched_in(ctx, cpuctx); } @@ -2773,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx, * cpu flexible, task flexible. */ cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE); - - if (ctx->nr_events) - cpuctx->task_ctx = ctx; - - perf_event_sched_in(cpuctx, cpuctx->task_ctx, task); - + perf_event_sched_in(cpuctx, ctx, task); perf_pmu_enable(ctx->pmu); perf_ctx_unlock(cpuctx, ctx); } @@ -2800,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev, struct perf_event_context *ctx; int ctxn; + /* + * If cgroup events exist on this CPU, then we need to check if we have + * to switch in PMU state; cgroup event are system-wide mode only. + * + * Since cgroup events are CPU events, we must schedule these in before + * we schedule in the task events. + */ + if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) + perf_cgroup_sched_in(prev, task); + for_each_task_context_nr(ctxn) { ctx = task->perf_event_ctxp[ctxn]; if (likely(!ctx)) @@ -2807,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev, perf_event_context_sched_in(ctx, task); } - /* - * if cgroup events exist on this CPU, then we need - * to check if we have to switch in PMU state. - * cgroup event are system-wide mode only - */ - if (atomic_read(this_cpu_ptr(&perf_cgroup_events))) - perf_cgroup_sched_in(prev, task); if (atomic_read(&nr_switch_events)) perf_event_switch(task, prev, true); @@ -3099,46 +3160,31 @@ static int event_enable_on_exec(struct perf_event *event, static void perf_event_enable_on_exec(int ctxn) { struct perf_event_context *ctx, *clone_ctx = NULL; + struct perf_cpu_context *cpuctx; struct perf_event *event; unsigned long flags; int enabled = 0; - int ret; local_irq_save(flags); ctx = current->perf_event_ctxp[ctxn]; if (!ctx || !ctx->nr_events) goto out; - /* - * We must ctxsw out cgroup events to avoid conflict - * when invoking perf_task_event_sched_in() later on - * in this function. Otherwise we end up trying to - * ctxswin cgroup events which are already scheduled - * in. - */ - perf_cgroup_sched_out(current, NULL); - - raw_spin_lock(&ctx->lock); - task_ctx_sched_out(ctx); - - list_for_each_entry(event, &ctx->event_list, event_entry) { - ret = event_enable_on_exec(event, ctx); - if (ret) - enabled = 1; - } + cpuctx = __get_cpu_context(ctx); + perf_ctx_lock(cpuctx, ctx); + ctx_sched_out(ctx, cpuctx, EVENT_TIME); + list_for_each_entry(event, &ctx->event_list, event_entry) + enabled |= event_enable_on_exec(event, ctx); /* - * Unclone this context if we enabled any event. + * Unclone and reschedule this context if we enabled any event. */ - if (enabled) + if (enabled) { clone_ctx = unclone_ctx(ctx); + ctx_resched(cpuctx, ctx); + } + perf_ctx_unlock(cpuctx, ctx); - raw_spin_unlock(&ctx->lock); - - /* - * Also calls ctxswin for cgroup events, if any: - */ - perf_event_context_sched_in(ctx, ctx->task); out: local_irq_restore(flags); @@ -3334,7 +3380,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx) INIT_LIST_HEAD(&ctx->flexible_groups); INIT_LIST_HEAD(&ctx->event_list); atomic_set(&ctx->refcount, 1); - INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work); } static struct perf_event_context * @@ -3521,11 +3566,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu) static void unaccount_event(struct perf_event *event) { + bool dec = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (event->attr.mmap || event->attr.mmap_data) atomic_dec(&nr_mmap_events); if (event->attr.comm) @@ -3535,17 +3582,30 @@ static void unaccount_event(struct perf_event *event) if (event->attr.freq) atomic_dec(&nr_freq_events); if (event->attr.context_switch) { - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; atomic_dec(&nr_switch_events); } if (is_cgroup_event(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; if (has_branch_stack(event)) - static_key_slow_dec_deferred(&perf_sched_events); + dec = true; + + if (dec) { + if (!atomic_add_unless(&perf_sched_count, -1, 1)) + schedule_delayed_work(&perf_sched_work, HZ); + } unaccount_event_cpu(event, event->cpu); } +static void perf_sched_delayed(struct work_struct *work) +{ + mutex_lock(&perf_sched_mutex); + if (atomic_dec_and_test(&perf_sched_count)) + static_branch_disable(&perf_sched_events); + mutex_unlock(&perf_sched_mutex); +} + /* * The following implement mutual exclusion of events on "exclusive" pmus * (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled @@ -3556,7 +3616,7 @@ static void unaccount_event(struct perf_event *event) * 3) two matching events on the same context. * * The former two cases are handled in the allocation path (perf_event_alloc(), - * __free_event()), the latter -- before the first perf_install_in_context(). + * _free_event()), the latter -- before the first perf_install_in_context(). */ static int exclusive_event_init(struct perf_event *event) { @@ -3631,29 +3691,6 @@ static bool exclusive_event_installable(struct perf_event *event, return true; } -static void __free_event(struct perf_event *event) -{ - if (!event->parent) { - if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) - put_callchain_buffers(); - } - - perf_event_free_bpf_prog(event); - - if (event->destroy) - event->destroy(event); - - if (event->ctx) - put_ctx(event->ctx); - - if (event->pmu) { - exclusive_event_destroy(event); - module_put(event->pmu->module); - } - - call_rcu(&event->rcu_head, free_event_rcu); -} - static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending); @@ -3675,7 +3712,25 @@ static void _free_event(struct perf_event *event) if (is_cgroup_event(event)) perf_detach_cgroup(event); - __free_event(event); + if (!event->parent) { + if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) + put_callchain_buffers(); + } + + perf_event_free_bpf_prog(event); + + if (event->destroy) + event->destroy(event); + + if (event->ctx) + put_ctx(event->ctx); + + if (event->pmu) { + exclusive_event_destroy(event); + module_put(event->pmu->module); + } + + call_rcu(&event->rcu_head, free_event_rcu); } /* @@ -3702,14 +3757,13 @@ static void perf_remove_from_owner(struct perf_event *event) struct task_struct *owner; rcu_read_lock(); - owner = ACCESS_ONCE(event->owner); /* - * Matches the smp_wmb() in perf_event_exit_task(). If we observe - * !owner it means the list deletion is complete and we can indeed - * free this event, otherwise we need to serialize on + * Matches the smp_store_release() in perf_event_exit_task(). If we + * observe !owner it means the list deletion is complete and we can + * indeed free this event, otherwise we need to serialize on * owner->perf_event_mutex. */ - smp_read_barrier_depends(); + owner = lockless_dereference(event->owner); if (owner) { /* * Since delayed_put_task_struct() also drops the last @@ -3737,8 +3791,10 @@ static void perf_remove_from_owner(struct perf_event *event) * ensured they're done, and we can proceed with freeing the * event. */ - if (event->owner) + if (event->owner) { list_del_init(&event->owner_entry); + smp_store_release(&event->owner, NULL); + } mutex_unlock(&owner->perf_event_mutex); put_task_struct(owner); } @@ -3746,37 +3802,111 @@ static void perf_remove_from_owner(struct perf_event *event) static void put_event(struct perf_event *event) { - struct perf_event_context *ctx; - if (!atomic_long_dec_and_test(&event->refcount)) return; + _free_event(event); +} + +/* + * Kill an event dead; while event:refcount will preserve the event + * object, it will not preserve its functionality. Once the last 'user' + * gives up the object, we'll destroy the thing. + */ +int perf_event_release_kernel(struct perf_event *event) +{ + struct perf_event_context *ctx = event->ctx; + struct perf_event *child, *tmp; + + /* + * If we got here through err_file: fput(event_file); we will not have + * attached to a context yet. + */ + if (!ctx) { + WARN_ON_ONCE(event->attach_state & + (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP)); + goto no_ctx; + } + if (!is_kernel_event(event)) perf_remove_from_owner(event); + ctx = perf_event_ctx_lock(event); + WARN_ON_ONCE(ctx->parent_ctx); + perf_remove_from_context(event, DETACH_GROUP); + + raw_spin_lock_irq(&ctx->lock); /* - * There are two ways this annotation is useful: + * Mark this even as STATE_DEAD, there is no external reference to it + * anymore. * - * 1) there is a lock recursion from perf_event_exit_task - * see the comment there. + * Anybody acquiring event->child_mutex after the below loop _must_ + * also see this, most importantly inherit_event() which will avoid + * placing more children on the list. * - * 2) there is a lock-inversion with mmap_sem through - * perf_read_group(), which takes faults while - * holding ctx->mutex, however this is called after - * the last filedesc died, so there is no possibility - * to trigger the AB-BA case. + * Thus this guarantees that we will in fact observe and kill _ALL_ + * child events. */ - ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING); - WARN_ON_ONCE(ctx->parent_ctx); - perf_remove_from_context(event, true); + event->state = PERF_EVENT_STATE_DEAD; + raw_spin_unlock_irq(&ctx->lock); + perf_event_ctx_unlock(event, ctx); - _free_event(event); -} +again: + mutex_lock(&event->child_mutex); + list_for_each_entry(child, &event->child_list, child_list) { -int perf_event_release_kernel(struct perf_event *event) -{ - put_event(event); + /* + * Cannot change, child events are not migrated, see the + * comment with perf_event_ctx_lock_nested(). + */ + ctx = lockless_dereference(child->ctx); + /* + * Since child_mutex nests inside ctx::mutex, we must jump + * through hoops. We start by grabbing a reference on the ctx. + * + * Since the event cannot get freed while we hold the + * child_mutex, the context must also exist and have a !0 + * reference count. + */ + get_ctx(ctx); + + /* + * Now that we have a ctx ref, we can drop child_mutex, and + * acquire ctx::mutex without fear of it going away. Then we + * can re-acquire child_mutex. + */ + mutex_unlock(&event->child_mutex); + mutex_lock(&ctx->mutex); + mutex_lock(&event->child_mutex); + + /* + * Now that we hold ctx::mutex and child_mutex, revalidate our + * state, if child is still the first entry, it didn't get freed + * and we can continue doing so. + */ + tmp = list_first_entry_or_null(&event->child_list, + struct perf_event, child_list); + if (tmp == child) { + perf_remove_from_context(child, DETACH_GROUP); + list_del(&child->child_list); + free_event(child); + /* + * This matches the refcount bump in inherit_event(); + * this can't be the last reference. + */ + put_event(event); + } + + mutex_unlock(&event->child_mutex); + mutex_unlock(&ctx->mutex); + put_ctx(ctx); + goto again; + } + mutex_unlock(&event->child_mutex); + +no_ctx: + put_event(event); /* Must be the 'last' reference */ return 0; } EXPORT_SYMBOL_GPL(perf_event_release_kernel); @@ -3786,46 +3916,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel); */ static int perf_release(struct inode *inode, struct file *file) { - put_event(file->private_data); + perf_event_release_kernel(file->private_data); return 0; } -/* - * Remove all orphanes events from the context. - */ -static void orphans_remove_work(struct work_struct *work) -{ - struct perf_event_context *ctx; - struct perf_event *event, *tmp; - - ctx = container_of(work, struct perf_event_context, - orphans_remove.work); - - mutex_lock(&ctx->mutex); - list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) { - struct perf_event *parent_event = event->parent; - - if (!is_orphaned_child(event)) - continue; - - perf_remove_from_context(event, true); - - mutex_lock(&parent_event->child_mutex); - list_del_init(&event->child_list); - mutex_unlock(&parent_event->child_mutex); - - free_event(event); - put_event(parent_event); - } - - raw_spin_lock_irq(&ctx->lock); - ctx->orphans_remove_sched = false; - raw_spin_unlock_irq(&ctx->lock); - mutex_unlock(&ctx->mutex); - - put_ctx(ctx); -} - u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running) { struct perf_event *child; @@ -3969,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event) { bool no_children; - if (event->state != PERF_EVENT_STATE_EXIT) + if (event->state > PERF_EVENT_STATE_EXIT) return false; mutex_lock(&event->child_mutex); @@ -4054,7 +4148,7 @@ static void _perf_event_reset(struct perf_event *event) /* * Holding the top-level event's child_mutex means that any * descendant process that has inherited this event will block - * in sync_child_event if it goes to exit, thus satisfying the + * in perf_event_exit_event() if it goes to exit, thus satisfying the * task existence requirements of perf_event_enable/disable. */ static void perf_event_for_each_child(struct perf_event *event, @@ -4086,36 +4180,14 @@ static void perf_event_for_each(struct perf_event *event, perf_event_for_each_child(sibling, func); } -struct period_event { - struct perf_event *event; - u64 value; -}; - -static void ___perf_event_period(void *info) -{ - struct period_event *pe = info; - struct perf_event *event = pe->event; - u64 value = pe->value; - - if (event->attr.freq) { - event->attr.sample_freq = value; - } else { - event->attr.sample_period = value; - event->hw.sample_period = value; - } - - local64_set(&event->hw.period_left, 0); -} - -static int __perf_event_period(void *info) +static void __perf_event_period(struct perf_event *event, + struct perf_cpu_context *cpuctx, + struct perf_event_context *ctx, + void *info) { - struct period_event *pe = info; - struct perf_event *event = pe->event; - struct perf_event_context *ctx = event->ctx; - u64 value = pe->value; + u64 value = *((u64 *)info); bool active; - raw_spin_lock(&ctx->lock); if (event->attr.freq) { event->attr.sample_freq = value; } else { @@ -4135,14 +4207,10 @@ static int __perf_event_period(void *info) event->pmu->start(event, PERF_EF_RELOAD); perf_pmu_enable(ctx->pmu); } - raw_spin_unlock(&ctx->lock); - - return 0; } static int perf_event_period(struct perf_event *event, u64 __user *arg) { - struct period_event pe = { .event = event, }; u64 value; if (!is_sampling_event(event)) @@ -4157,10 +4225,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg) if (event->attr.freq && value > sysctl_perf_event_sample_rate) return -EINVAL; - pe.value = value; - - event_function_call(event, __perf_event_period, - ___perf_event_period, &pe); + event_function_call(event, __perf_event_period, &value); return 0; } @@ -4932,7 +4997,7 @@ static void perf_pending_event(struct irq_work *entry) if (event->pending_disable) { event->pending_disable = 0; - __perf_event_disable(event); + perf_event_disable_local(event); } if (event->pending_wakeup) { @@ -7753,11 +7818,13 @@ static void account_event_cpu(struct perf_event *event, int cpu) static void account_event(struct perf_event *event) { + bool inc = false; + if (event->parent) return; if (event->attach_state & PERF_ATTACH_TASK) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (event->attr.mmap || event->attr.mmap_data) atomic_inc(&nr_mmap_events); if (event->attr.comm) @@ -7770,12 +7837,35 @@ static void account_event(struct perf_event *event) } if (event->attr.context_switch) { atomic_inc(&nr_switch_events); - static_key_slow_inc(&perf_sched_events.key); + inc = true; } if (has_branch_stack(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; if (is_cgroup_event(event)) - static_key_slow_inc(&perf_sched_events.key); + inc = true; + + if (inc) { + if (atomic_inc_not_zero(&perf_sched_count)) + goto enabled; + + mutex_lock(&perf_sched_mutex); + if (!atomic_read(&perf_sched_count)) { + static_branch_enable(&perf_sched_events); + /* + * Guarantee that all CPUs observe they key change and + * call the perf scheduling hooks before proceeding to + * install events that need them. + */ + synchronize_sched(); + } + /* + * Now that we have waited for the sync_sched(), allow further + * increments to by-pass the mutex. + */ + atomic_inc(&perf_sched_count); + mutex_unlock(&perf_sched_mutex); + } +enabled: account_event_cpu(event, event->cpu); } @@ -8394,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open, if (move_group) { gctx = group_leader->ctx; mutex_lock_double(&gctx->mutex, &ctx->mutex); + if (gctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } } else { mutex_lock(&ctx->mutex); } + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_locked; + } + if (!perf_event_validate_size(event)) { err = -E2BIG; goto err_locked; @@ -8422,11 +8521,11 @@ SYSCALL_DEFINE5(perf_event_open, * See perf_event_ctx_lock() for comments on the details * of swizzling perf_event::ctx. */ - perf_remove_from_context(group_leader, false); + perf_remove_from_context(group_leader, 0); list_for_each_entry(sibling, &group_leader->sibling_list, group_entry) { - perf_remove_from_context(sibling, false); + perf_remove_from_context(sibling, 0); put_ctx(gctx); } @@ -8479,6 +8578,8 @@ SYSCALL_DEFINE5(perf_event_open, perf_event__header_size(event); perf_event__id_header_size(event); + event->owner = current; + perf_install_in_context(ctx, event, event->cpu); perf_unpin_context(ctx); @@ -8488,8 +8589,6 @@ SYSCALL_DEFINE5(perf_event_open, put_online_cpus(); - event->owner = current; - mutex_lock(¤t->perf_event_mutex); list_add_tail(&event->owner_entry, ¤t->perf_event_list); mutex_unlock(¤t->perf_event_mutex); @@ -8514,7 +8613,12 @@ err_context: perf_unpin_context(ctx); put_ctx(ctx); err_alloc: - free_event(event); + /* + * If event_file is set, the fput() above will have called ->release() + * and that will take care of freeing the event. + */ + if (!event_file) + free_event(event); err_cpus: put_online_cpus(); err_task: @@ -8556,7 +8660,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } /* Mark owner so we could distinguish it from user events. */ - event->owner = EVENT_OWNER_KERNEL; + event->owner = TASK_TOMBSTONE; account_event(event); @@ -8568,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, WARN_ON_ONCE(ctx->parent_ctx); mutex_lock(&ctx->mutex); + if (ctx->task == TASK_TOMBSTONE) { + err = -ESRCH; + goto err_unlock; + } + if (!exclusive_event_installable(event, ctx)) { - mutex_unlock(&ctx->mutex); - perf_unpin_context(ctx); - put_ctx(ctx); err = -EBUSY; - goto err_free; + goto err_unlock; } perf_install_in_context(ctx, event, cpu); @@ -8582,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, return event; +err_unlock: + mutex_unlock(&ctx->mutex); + perf_unpin_context(ctx); + put_ctx(ctx); err_free: free_event(event); err: @@ -8606,7 +8716,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu) mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex); list_for_each_entry_safe(event, tmp, &src_ctx->event_list, event_entry) { - perf_remove_from_context(event, false); + perf_remove_from_context(event, 0); unaccount_event_cpu(event, src_cpu); put_ctx(src_ctx); list_add(&event->migrate_entry, &events); @@ -8673,33 +8783,15 @@ static void sync_child_event(struct perf_event *child_event, &parent_event->child_total_time_enabled); atomic64_add(child_event->total_time_running, &parent_event->child_total_time_running); - - /* - * Remove this event from the parent's list - */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); - list_del_init(&child_event->child_list); - mutex_unlock(&parent_event->child_mutex); - - /* - * Make sure user/parent get notified, that we just - * lost one event. - */ - perf_event_wakeup(parent_event); - - /* - * Release the parent event, if this was the last - * reference to it. - */ - put_event(parent_event); } static void -__perf_event_exit_task(struct perf_event *child_event, - struct perf_event_context *child_ctx, - struct task_struct *child) +perf_event_exit_event(struct perf_event *child_event, + struct perf_event_context *child_ctx, + struct task_struct *child) { + struct perf_event *parent_event = child_event->parent; + /* * Do not destroy the 'original' grouping; because of the context * switch optimization the original events could've ended up in a @@ -8712,57 +8804,86 @@ __perf_event_exit_task(struct perf_event *child_event, * Do destroy all inherited groups, we don't care about those * and being thorough is better. */ - perf_remove_from_context(child_event, !!child_event->parent); + raw_spin_lock_irq(&child_ctx->lock); + WARN_ON_ONCE(child_ctx->is_active); + + if (parent_event) + perf_group_detach(child_event); + list_del_event(child_event, child_ctx); + child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */ + raw_spin_unlock_irq(&child_ctx->lock); /* - * It can happen that the parent exits first, and has events - * that are still around due to the child reference. These - * events need to be zapped. + * Parent events are governed by their filedesc, retain them. */ - if (child_event->parent) { - sync_child_event(child_event, child); - free_event(child_event); - } else { - child_event->state = PERF_EVENT_STATE_EXIT; + if (!parent_event) { perf_event_wakeup(child_event); + return; } + /* + * Child events can be cleaned up. + */ + + sync_child_event(child_event, child); + + /* + * Remove this event from the parent's list + */ + WARN_ON_ONCE(parent_event->ctx->parent_ctx); + mutex_lock(&parent_event->child_mutex); + list_del_init(&child_event->child_list); + mutex_unlock(&parent_event->child_mutex); + + /* + * Kick perf_poll() for is_event_hup(). + */ + perf_event_wakeup(parent_event); + free_event(child_event); + put_event(parent_event); } static void perf_event_exit_task_context(struct task_struct *child, int ctxn) { - struct perf_event *child_event, *next; struct perf_event_context *child_ctx, *clone_ctx = NULL; - unsigned long flags; + struct perf_event *child_event, *next; - if (likely(!child->perf_event_ctxp[ctxn])) + WARN_ON_ONCE(child != current); + + child_ctx = perf_pin_task_context(child, ctxn); + if (!child_ctx) return; - local_irq_save(flags); /* - * We can't reschedule here because interrupts are disabled, - * and either child is current or it is a task that can't be - * scheduled, so we are now safe from rescheduling changing - * our context. + * In order to reduce the amount of tricky in ctx tear-down, we hold + * ctx::mutex over the entire thing. This serializes against almost + * everything that wants to access the ctx. + * + * The exception is sys_perf_event_open() / + * perf_event_create_kernel_count() which does find_get_context() + * without ctx::mutex (it cannot because of the move_group double mutex + * lock thing). See the comments in perf_install_in_context(). */ - child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]); + mutex_lock(&child_ctx->mutex); /* - * Take the context lock here so that if find_get_context is - * reading child->perf_event_ctxp, we wait until it has - * incremented the context's refcount before we do put_ctx below. + * In a single ctx::lock section, de-schedule the events and detach the + * context from the task such that we cannot ever get it scheduled back + * in. */ - raw_spin_lock(&child_ctx->lock); - task_ctx_sched_out(child_ctx); - child->perf_event_ctxp[ctxn] = NULL; + raw_spin_lock_irq(&child_ctx->lock); + task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx); /* - * If this context is a clone; unclone it so it can't get - * swapped to another process while we're removing all - * the events from it. + * Now that the context is inactive, destroy the task <-> ctx relation + * and mark the context dead. */ + RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL); + put_ctx(child_ctx); /* cannot be last */ + WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE); + put_task_struct(current); /* cannot be last */ + clone_ctx = unclone_ctx(child_ctx); - update_context_time(child_ctx); - raw_spin_unlock_irqrestore(&child_ctx->lock, flags); + raw_spin_unlock_irq(&child_ctx->lock); if (clone_ctx) put_ctx(clone_ctx); @@ -8774,20 +8895,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn) */ perf_event_task(child, child_ctx, 0); - /* - * We can recurse on the same lock type through: - * - * __perf_event_exit_task() - * sync_child_event() - * put_event() - * mutex_lock(&ctx->mutex) - * - * But since its the parent context it won't be the same instance. - */ - mutex_lock(&child_ctx->mutex); - list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry) - __perf_event_exit_task(child_event, child_ctx, child); + perf_event_exit_event(child_event, child_ctx, child); mutex_unlock(&child_ctx->mutex); @@ -8812,8 +8921,7 @@ void perf_event_exit_task(struct task_struct *child) * the owner, closes a race against perf_release() where * we need to serialize on the owner->perf_event_mutex. */ - smp_wmb(); - event->owner = NULL; + smp_store_release(&event->owner, NULL); } mutex_unlock(&child->perf_event_mutex); @@ -8896,21 +9004,20 @@ void perf_event_delayed_put(struct task_struct *task) WARN_ON_ONCE(task->perf_event_ctxp[ctxn]); } -struct perf_event *perf_event_get(unsigned int fd) +struct file *perf_event_get(unsigned int fd) { - int err; - struct fd f; - struct perf_event *event; + struct file *file; - err = perf_fget_light(fd, &f); - if (err) - return ERR_PTR(err); + file = fget_raw(fd); + if (!file) + return ERR_PTR(-EBADF); - event = f.file->private_data; - atomic_long_inc(&event->refcount); - fdput(f); + if (file->f_op != &perf_fops) { + fput(file); + return ERR_PTR(-EBADF); + } - return event; + return file; } const struct perf_event_attr *perf_event_attrs(struct perf_event *event) @@ -8953,8 +9060,16 @@ inherit_event(struct perf_event *parent_event, if (IS_ERR(child_event)) return child_event; + /* + * is_orphaned_event() and list_add_tail(&parent_event->child_list) + * must be under the same lock in order to serialize against + * perf_event_release_kernel(), such that either we must observe + * is_orphaned_event() or they will observe us on the child_list. + */ + mutex_lock(&parent_event->child_mutex); if (is_orphaned_event(parent_event) || !atomic_long_inc_not_zero(&parent_event->refcount)) { + mutex_unlock(&parent_event->child_mutex); free_event(child_event); return NULL; } @@ -9002,8 +9117,6 @@ inherit_event(struct perf_event *parent_event, /* * Link this into the parent event's child list */ - WARN_ON_ONCE(parent_event->ctx->parent_ctx); - mutex_lock(&parent_event->child_mutex); list_add_tail(&child_event->child_list, &parent_event->child_list); mutex_unlock(&parent_event->child_mutex); @@ -9208,7 +9321,7 @@ static void perf_event_init_cpu(int cpu) struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu); mutex_lock(&swhash->hlist_mutex); - if (swhash->hlist_refcount > 0) { + if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) { struct swevent_hlist *hlist; hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu)); @@ -9221,13 +9334,14 @@ static void perf_event_init_cpu(int cpu) #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE static void __perf_event_exit_context(void *__info) { - struct remove_event re = { .detach_group = true }; struct perf_event_context *ctx = __info; + struct perf_cpu_context *cpuctx = __get_cpu_context(ctx); + struct perf_event *event; - rcu_read_lock(); - list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry) - __perf_remove_from_context(&re); - rcu_read_unlock(); + raw_spin_lock(&ctx->lock); + list_for_each_entry(event, &ctx->event_list, event_entry) + __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP); + raw_spin_unlock(&ctx->lock); } static void perf_event_exit_cpu_context(int cpu) @@ -9283,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) switch (action & ~CPU_TASKS_FROZEN) { case CPU_UP_PREPARE: - case CPU_DOWN_FAILED: perf_event_init_cpu(cpu); break; - case CPU_UP_CANCELED: case CPU_DOWN_PREPARE: perf_event_exit_cpu(cpu); break; @@ -9316,9 +9428,6 @@ void __init perf_event_init(void) ret = init_hw_breakpoint(); WARN(ret, "hw_breakpoint initialization failed with: %d", ret); - /* do not patch jump label more than once per second */ - jump_label_rate_limit(&perf_sched_events, HZ); - /* * Build time assertion that we keep the data_head at the intended * location. IOW, validation we got the __reserved[] size right. diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index 92ce5f4ccc26..3f8cb1e14588 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att * current task. */ if (irqs_disabled() && bp->ctx && bp->ctx->task == current) - __perf_event_disable(bp); + perf_event_disable_local(bp); else perf_event_disable(bp); diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index adfdc0536117..1faad2cfdb9e 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx) __free_page(page); } +static void __rb_free_aux(struct ring_buffer *rb) +{ + int pg; + + if (rb->aux_priv) { + rb->free_aux(rb->aux_priv); + rb->free_aux = NULL; + rb->aux_priv = NULL; + } + + if (rb->aux_nr_pages) { + for (pg = 0; pg < rb->aux_nr_pages; pg++) + rb_free_aux_page(rb, pg); + + kfree(rb->aux_pages); + rb->aux_nr_pages = 0; + } +} + int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event, pgoff_t pgoff, int nr_pages, long watermark, int flags) { @@ -547,30 +566,11 @@ out: if (!ret) rb->aux_pgoff = pgoff; else - rb_free_aux(rb); + __rb_free_aux(rb); return ret; } -static void __rb_free_aux(struct ring_buffer *rb) -{ - int pg; - - if (rb->aux_priv) { - rb->free_aux(rb->aux_priv); - rb->free_aux = NULL; - rb->aux_priv = NULL; - } - - if (rb->aux_nr_pages) { - for (pg = 0; pg < rb->aux_nr_pages; pg++) - rb_free_aux_page(rb, pg); - - kfree(rb->aux_pages); - rb->aux_nr_pages = 0; - } -} - void rb_free_aux(struct ring_buffer *rb) { if (atomic_dec_and_test(&rb->aux_refcount)) diff --git a/kernel/futex.c b/kernel/futex.c index 0773f2b23b10..5d6ce6413ef1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1191,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, if (pi_state->owner != current) return -EINVAL; - raw_spin_lock(&pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); /* @@ -1217,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, else if (curval != uval) ret = -EINVAL; if (ret) { - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); return ret; } - raw_spin_lock_irq(&pi_state->owner->pi_lock); + raw_spin_lock(&pi_state->owner->pi_lock); WARN_ON(list_empty(&pi_state->list)); list_del_init(&pi_state->list); - raw_spin_unlock_irq(&pi_state->owner->pi_lock); + raw_spin_unlock(&pi_state->owner->pi_lock); - raw_spin_lock_irq(&new_owner->pi_lock); + raw_spin_lock(&new_owner->pi_lock); WARN_ON(!list_empty(&pi_state->list)); list_add(&pi_state->list, &new_owner->pi_state_list); pi_state->owner = new_owner; - raw_spin_unlock_irq(&new_owner->pi_lock); + raw_spin_unlock(&new_owner->pi_lock); - raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); @@ -2127,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked) * we returned due to timeout or signal without taking the * rt_mutex. Too late. */ - raw_spin_lock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock); owner = rt_mutex_owner(&q->pi_state->pi_mutex); if (!owner) owner = rt_mutex_next_owner(&q->pi_state->pi_mutex); - raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock); + raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock); ret = fixup_pi_state_owner(uaddr, q, owner); goto out; } diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index a302cf9a2126..57bff7857e87 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -138,7 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) unsigned int flags = 0, irq = desc->irq_data.irq; struct irqaction *action = desc->action; - do { + /* action might have become NULL since we dropped the lock */ + while (action) { irqreturn_t res; trace_irq_handler_entry(irq, action); @@ -173,7 +174,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) retval |= res; action = action->next; - } while (action); + } add_interrupt_randomness(irq, flags); diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c index 6e655f7acd3b..3e56d2f03e24 100644 --- a/kernel/irq/irqdomain.c +++ b/kernel/irq/irqdomain.c @@ -575,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec) unsigned int type = IRQ_TYPE_NONE; int virq; - if (fwspec->fwnode) - domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY); - else + if (fwspec->fwnode) { + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_WIRED); + if (!domain) + domain = irq_find_matching_fwnode(fwspec->fwnode, + DOMAIN_BUS_ANY); + } else { domain = irq_default_domain; + } if (!domain) { pr_warn("no irq domain found for %s !\n", diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 60ace56618f6..716547fdb873 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes); #define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS) #define classhashentry(key) (classhash_table + __classhashfn((key))) -static struct list_head classhash_table[CLASSHASH_SIZE]; +static struct hlist_head classhash_table[CLASSHASH_SIZE]; /* * We put the lock dependency chains into a hash-table as well, to cache @@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE]; #define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS) #define chainhashentry(chain) (chainhash_table + __chainhashfn((chain))) -static struct list_head chainhash_table[CHAINHASH_SIZE]; +static struct hlist_head chainhash_table[CHAINHASH_SIZE]; /* * The hash key of the lock dependency chains is a hash itself too: @@ -666,7 +666,7 @@ static inline struct lock_class * look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; #ifdef CONFIG_DEBUG_LOCKDEP @@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass) if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) return NULL; - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) { /* * Huh! same key, different name? Did someone trample @@ -742,7 +742,7 @@ static inline struct lock_class * register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) { struct lockdep_subclass_key *key; - struct list_head *hash_head; + struct hlist_head *hash_head; struct lock_class *class; DEBUG_LOCKS_WARN_ON(!irqs_disabled()); @@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We have to do the hash-walk again, to avoid races * with another CPU: */ - list_for_each_entry_rcu(class, hash_head, hash_entry) { + hlist_for_each_entry_rcu(class, hash_head, hash_entry) { if (class->key == key) goto out_unlock_set; } @@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force) * We use RCU's safe list-add method to make * parallel walking of the hash-list safe: */ - list_add_tail_rcu(&class->hash_entry, hash_head); + hlist_add_head_rcu(&class->hash_entry, hash_head); /* * Add it to the global list of classes: */ @@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next, */ static int check_prev_add(struct task_struct *curr, struct held_lock *prev, - struct held_lock *next, int distance, int trylock_loop) + struct held_lock *next, int distance, int *stack_saved) { struct lock_list *entry; int ret; @@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, } } - if (!trylock_loop && !save_trace(&trace)) - return 0; + if (!*stack_saved) { + if (!save_trace(&trace)) + return 0; + *stack_saved = 1; + } /* * Ok, all validations passed, add the new lock @@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Debugging printouts: */ if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) { + /* We drop graph lock, so another thread can overwrite trace. */ + *stack_saved = 0; graph_unlock(); printk("\n new dependency: "); print_lock_name(hlock_class(prev)); @@ -1929,7 +1934,7 @@ static int check_prevs_add(struct task_struct *curr, struct held_lock *next) { int depth = curr->lockdep_depth; - int trylock_loop = 0; + int stack_saved = 0; struct held_lock *hlock; /* @@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) */ if (hlock->read != 2 && hlock->check) { if (!check_prev_add(curr, hlock, next, - distance, trylock_loop)) + distance, &stack_saved)) return 0; /* * Stop after the first non-trylock entry, @@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next) if (curr->held_locks[depth].irq_context != curr->held_locks[depth-1].irq_context) break; - trylock_loop = 1; } return 1; out_bug: @@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, u64 chain_key) { struct lock_class *class = hlock_class(hlock); - struct list_head *hash_head = chainhashentry(chain_key); + struct hlist_head *hash_head = chainhashentry(chain_key); struct lock_chain *chain; struct held_lock *hlock_curr; int i, j; @@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr, * We can walk it lock-free, because entries only get added * to the hash: */ - list_for_each_entry_rcu(chain, hash_head, entry) { + hlist_for_each_entry_rcu(chain, hash_head, entry) { if (chain->chain_key == chain_key) { cache_hit: debug_atomic_inc(chain_lookup_hits); @@ -2057,7 +2061,7 @@ cache_hit: /* * We have to walk the chain again locked - to avoid duplicates: */ - list_for_each_entry(chain, hash_head, entry) { + hlist_for_each_entry(chain, hash_head, entry) { if (chain->chain_key == chain_key) { graph_unlock(); goto cache_hit; @@ -2091,7 +2095,7 @@ cache_hit: } chain_hlocks[chain->base + j] = class - lock_classes; } - list_add_tail_rcu(&chain->entry, hash_head); + hlist_add_head_rcu(&chain->entry, hash_head); debug_atomic_inc(chain_lookup_misses); inc_chains(); @@ -3875,7 +3879,7 @@ void lockdep_reset(void) nr_process_chains = 0; debug_locks = 1; for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); raw_local_irq_restore(flags); } @@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class) /* * Unhash the class and remove it from the all_lock_classes list: */ - list_del_rcu(&class->hash_entry); + hlist_del_rcu(&class->hash_entry); list_del_rcu(&class->lock_entry); RCU_INIT_POINTER(class->key, NULL); @@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size) void lockdep_free_key_range(void *start, unsigned long size) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i; int locked; @@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size) */ for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { if (within(class->key, start, size)) zap_class(class); else if (within(class->name, start, size)) @@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size) void lockdep_reset_lock(struct lockdep_map *lock) { struct lock_class *class; - struct list_head *head; + struct hlist_head *head; unsigned long flags; int i, j; int locked; @@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock) locked = graph_lock(); for (i = 0; i < CLASSHASH_SIZE; i++) { head = classhash_table + i; - if (list_empty(head)) - continue; - list_for_each_entry_rcu(class, head, hash_entry) { + hlist_for_each_entry_rcu(class, head, hash_entry) { int match = 0; for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++) @@ -4027,10 +4027,10 @@ void lockdep_init(void) return; for (i = 0; i < CLASSHASH_SIZE; i++) - INIT_LIST_HEAD(classhash_table + i); + INIT_HLIST_HEAD(classhash_table + i); for (i = 0; i < CHAINHASH_SIZE; i++) - INIT_LIST_HEAD(chainhash_table + i); + INIT_HLIST_HEAD(chainhash_table + i); lockdep_initialized = 1; } diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 8251e75dd9c0..3e746607abe5 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) * 2) Drop lock->wait_lock * 3) Try to unlock the lock with cmpxchg */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { struct task_struct *owner = rt_mutex_owner(lock); clear_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* * If a new waiter comes in between the unlock and the cmpxchg * we have two situations: @@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock) /* * Simple slow path only version: lock->owner is protected by lock->wait_lock. */ -static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock) +static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, + unsigned long flags) __releases(lock->wait_lock) { lock->owner = NULL; - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; } #endif @@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, int ret = 0, depth = 0; struct rt_mutex *lock; bool detect_deadlock; - unsigned long flags; bool requeue = true; detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk); @@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * [1] Task cannot go away as we did a get_task() before ! */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock_irq(&task->pi_lock); /* * [2] Get the waiter on which @task is blocked on. @@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * operations. */ if (!raw_spin_trylock(&lock->wait_lock)) { - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); cpu_relax(); goto retry; } @@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * No requeue[7] here. Just release @task [8] */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * If there is no owner of the lock, end of chain. */ if (!rt_mutex_owner(lock)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* * No requeue [11] here. We just do deadlock detection. @@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* If owner is not blocked, end of chain. */ if (!next_lock) @@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, rt_mutex_enqueue(lock, waiter); /* [8] Release the task */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); put_task_struct(task); /* @@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, */ if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) wake_up_process(rt_mutex_top_waiter(lock)->task); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 0; } /* [10] Grab the next task, i.e. the owner of @lock */ task = rt_mutex_owner(lock); get_task_struct(task); - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); /* [11] requeue the pi waiters if necessary */ if (waiter == rt_mutex_top_waiter(lock)) { @@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, top_waiter = rt_mutex_top_waiter(lock); /* [13] Drop the locks */ - raw_spin_unlock_irqrestore(&task->pi_lock, flags); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock(&task->pi_lock); + raw_spin_unlock_irq(&lock->wait_lock); /* * Make the actual exit decisions [12], based on the stored @@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, goto again; out_unlock_pi: - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock_irq(&task->pi_lock); out_put_task: put_task_struct(task); @@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, /* * Try to take an rt-mutex * - * Must be called with lock->wait_lock held. + * Must be called with lock->wait_lock held and interrupts disabled * * @lock: The lock to be acquired. * @task: The task which wants to acquire the lock @@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) { - unsigned long flags; - /* * Before testing whether we can acquire @lock, we set the * RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all @@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, * case, but conditionals are more expensive than a redundant * store. */ - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); task->pi_blocked_on = NULL; /* * Finish the lock acquisition. @task is the new owner. If @@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock)); - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); takeit: /* We got the lock. */ @@ -883,7 +882,7 @@ takeit: * * Prepare waiter and propagate pi chain * - * This must be called with lock->wait_lock held. + * This must be called with lock->wait_lock held and interrupts disabled */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, @@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *top_waiter = waiter; struct rt_mutex *next_lock; int chain_walk = 0, res; - unsigned long flags; /* * Early deadlock detection. We really don't want the task to @@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, if (owner == task) return -EDEADLK; - raw_spin_lock_irqsave(&task->pi_lock, flags); + raw_spin_lock(&task->pi_lock); __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; @@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, task->pi_blocked_on = waiter; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); + raw_spin_unlock(&task->pi_lock); if (!owner) return 0; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); if (waiter == rt_mutex_top_waiter(lock)) { rt_mutex_dequeue_pi(owner, top_waiter); rt_mutex_enqueue_pi(owner, waiter); @@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Even if full deadlock detection is on, if the owner is not * blocked itself, we can avoid finding this out in the chain @@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, chwalk, lock, next_lock, waiter, task); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); return res; } @@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Remove the top waiter from the current tasks pi waiter tree and * queue it up. * - * Called with lock->wait_lock held. + * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); waiter = rt_mutex_top_waiter(lock); @@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, */ lock->owner = (void *) RT_MUTEX_HAS_WAITERS; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); wake_q_add(wake_q, waiter->task); } @@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, /* * Remove a waiter from a lock and give up * - * Must be called with lock->wait_lock held and + * Must be called with lock->wait_lock held and interrupts disabled. I must * have just failed to try_to_take_rt_mutex(). */ static void remove_waiter(struct rt_mutex *lock, @@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock, bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); struct rt_mutex *next_lock; - unsigned long flags; - raw_spin_lock_irqsave(¤t->pi_lock, flags); + raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); current->pi_blocked_on = NULL; - raw_spin_unlock_irqrestore(¤t->pi_lock, flags); + raw_spin_unlock(¤t->pi_lock); /* * Only update priority if the waiter was the highest priority @@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock, if (!owner || !is_top_waiter) return; - raw_spin_lock_irqsave(&owner->pi_lock, flags); + raw_spin_lock(&owner->pi_lock); rt_mutex_dequeue_pi(owner, waiter); @@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock, /* Store the lock on which owner is blocked or NULL */ next_lock = task_blocked_on_lock(owner); - raw_spin_unlock_irqrestore(&owner->pi_lock, flags); + raw_spin_unlock(&owner->pi_lock); /* * Don't walk the chain, if the owner task is not blocked @@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock, /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(owner); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock, next_lock, NULL, current); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); } /* @@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task) * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop * @lock: the rt_mutex to take * @state: the state the task should block in (TASK_INTERRUPTIBLE - * or TASK_UNINTERRUPTIBLE) + * or TASK_UNINTERRUPTIBLE) * @timeout: the pre-initialized and started timer, or NULL for none * @waiter: the pre-initialized rt_mutex_waiter * - * lock->wait_lock must be held by the caller. + * Must be called with lock->wait_lock held and interrupts disabled */ static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, @@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, break; } - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); schedule(); - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(state); } @@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, enum rtmutex_chainwalk chwalk) { struct rt_mutex_waiter waiter; + unsigned long flags; int ret = 0; debug_rt_mutex_init_waiter(&waiter); RB_CLEAR_NODE(&waiter.pi_tree_entry); RB_CLEAR_NODE(&waiter.tree_entry); - raw_spin_lock(&lock->wait_lock); + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } @@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* Remove pending timer: */ if (unlikely(timeout)) @@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, */ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) { + unsigned long flags; int ret; /* @@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) return 0; /* - * The mutex has currently no owner. Lock the wait lock and - * try to acquire the lock. + * The mutex has currently no owner. Lock the wait lock and try to + * acquire the lock. We use irqsave here to support early boot calls. */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); ret = try_to_take_rt_mutex(lock, current, NULL); @@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return ret; } @@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, struct wake_q_head *wake_q) { - raw_spin_lock(&lock->wait_lock); + unsigned long flags; + + /* irqsave required to support early boot calls */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); debug_rt_mutex_unlock(lock); @@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ while (!rt_mutex_has_waiters(lock)) { /* Drops lock->wait_lock ! */ - if (unlock_rt_mutex_safe(lock) == true) + if (unlock_rt_mutex_safe(lock, flags) == true) return false; /* Relock the rtmutex and try again */ - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irqsave(&lock->wait_lock, flags); } /* @@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ mark_wakeup_next_waiter(wake_q, lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); /* check PI boosting */ return true; @@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); if (try_to_take_rt_mutex(lock, task, NULL)) { - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return 1; } @@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (unlikely(ret)) remove_waiter(lock, waiter); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); @@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, { int ret; - raw_spin_lock(&lock->wait_lock); + raw_spin_lock_irq(&lock->wait_lock); set_current_state(TASK_INTERRUPTIBLE); @@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, */ fixup_rt_mutex_waiters(lock); - raw_spin_unlock(&lock->wait_lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; } diff --git a/kernel/memremap.c b/kernel/memremap.c index 293309cac061..97b31c774274 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -115,7 +115,7 @@ EXPORT_SYMBOL(memunmap); static void devm_memremap_release(struct device *dev, void *res) { - memunmap(res); + memunmap(*(void **)res); } static int devm_memremap_match(struct device *dev, void *res, void *match_data) @@ -137,8 +137,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset, if (addr) { *ptr = addr; devres_add(dev, ptr); - } else + } else { devres_free(ptr); + return ERR_PTR(-ENXIO); + } return addr; } @@ -151,7 +153,7 @@ void devm_memunmap(struct device *dev, void *addr) } EXPORT_SYMBOL(devm_memunmap); -pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags) +pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags) { return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags); } @@ -184,7 +186,11 @@ EXPORT_SYMBOL(put_zone_device_page); static void pgmap_radix_release(struct resource *res) { - resource_size_t key; + resource_size_t key, align_start, align_size, align_end; + + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; mutex_lock(&pgmap_lock); for (key = res->start; key <= res->end; key += SECTION_SIZE) @@ -227,12 +233,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data) percpu_ref_put(pgmap->ref); } - pgmap_radix_release(res); - /* pages are dead and unused, undo the arch mapping */ align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); } @@ -268,7 +273,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, { int is_ram = region_intersects(res->start, resource_size(res), IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - resource_size_t key, align_start, align_size; + resource_size_t key, align_start, align_size, align_end; struct dev_pagemap *pgmap; struct page_map *page_map; unsigned long pfn; @@ -310,7 +315,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, mutex_lock(&pgmap_lock); error = 0; - for (key = res->start; key <= res->end; key += SECTION_SIZE) { + align_start = res->start & ~(SECTION_SIZE - 1); + align_size = ALIGN(resource_size(res), SECTION_SIZE); + align_end = align_start + align_size - 1; + for (key = align_start; key <= align_end; key += SECTION_SIZE) { struct dev_pagemap *dup; rcu_read_lock(); @@ -337,8 +345,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); - align_start = res->start & ~(SECTION_SIZE - 1); - align_size = ALIGN(resource_size(res), SECTION_SIZE); error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; diff --git a/kernel/module.c b/kernel/module.c index 8358f4697c0c..794ebe8e878d 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -303,6 +303,9 @@ struct load_info { struct _ddebug *debug; unsigned int num_debug; bool sig_ok; +#ifdef CONFIG_KALLSYMS + unsigned long mod_kallsyms_init_off; +#endif struct { unsigned int sym, str, mod, vers, info, pcpu; } index; @@ -981,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, mod->exit(); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); + async_synchronize_full(); /* Store the name of the last unloaded module for diagnostic purposes */ @@ -2480,10 +2485,21 @@ static void layout_symtab(struct module *mod, struct load_info *info) strsect->sh_flags |= SHF_ALLOC; strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect, info->index.str) | INIT_OFFSET_MASK; - mod->init_layout.size = debug_align(mod->init_layout.size); pr_debug("\t%s\n", info->secstrings + strsect->sh_name); + + /* We'll tack temporary mod_kallsyms on the end. */ + mod->init_layout.size = ALIGN(mod->init_layout.size, + __alignof__(struct mod_kallsyms)); + info->mod_kallsyms_init_off = mod->init_layout.size; + mod->init_layout.size += sizeof(struct mod_kallsyms); + mod->init_layout.size = debug_align(mod->init_layout.size); } +/* + * We use the full symtab and strtab which layout_symtab arranged to + * be appended to the init section. Later we switch to the cut-down + * core-only ones. + */ static void add_kallsyms(struct module *mod, const struct load_info *info) { unsigned int i, ndst; @@ -2492,29 +2508,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) char *s; Elf_Shdr *symsec = &info->sechdrs[info->index.sym]; - mod->symtab = (void *)symsec->sh_addr; - mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym); + /* Set up to point into init section. */ + mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off; + + mod->kallsyms->symtab = (void *)symsec->sh_addr; + mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym); /* Make sure we get permanent strtab: don't use info->strtab. */ - mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr; + mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr; /* Set types up while we still have access to sections. */ - for (i = 0; i < mod->num_symtab; i++) - mod->symtab[i].st_info = elf_type(&mod->symtab[i], info); - - mod->core_symtab = dst = mod->core_layout.base + info->symoffs; - mod->core_strtab = s = mod->core_layout.base + info->stroffs; - src = mod->symtab; - for (ndst = i = 0; i < mod->num_symtab; i++) { + for (i = 0; i < mod->kallsyms->num_symtab; i++) + mod->kallsyms->symtab[i].st_info + = elf_type(&mod->kallsyms->symtab[i], info); + + /* Now populate the cut down core kallsyms for after init. */ + mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs; + mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs; + src = mod->kallsyms->symtab; + for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) { if (i == 0 || is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum, info->index.pcpu)) { dst[ndst] = src[i]; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src[i].st_name], + dst[ndst++].st_name = s - mod->core_kallsyms.strtab; + s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name], KSYM_NAME_LEN) + 1; } } - mod->core_num_syms = ndst; + mod->core_kallsyms.num_symtab = ndst; } #else static inline void layout_symtab(struct module *mod, struct load_info *info) @@ -3263,9 +3284,8 @@ static noinline int do_init_module(struct module *mod) module_put(mod); trim_init_extable(mod); #ifdef CONFIG_KALLSYMS - mod->num_symtab = mod->core_num_syms; - mod->symtab = mod->core_symtab; - mod->strtab = mod->core_strtab; + /* Switch to core kallsyms now init is done: kallsyms may be walking! */ + rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms); #endif mod_tree_remove_init(mod); disable_ro_nx(&mod->init_layout); @@ -3295,6 +3315,7 @@ fail: module_put(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, mod); + ftrace_release_mod(mod); free_module(mod); wake_up_all(&module_wq); return ret; @@ -3371,6 +3392,7 @@ static int complete_formation(struct module *mod, struct load_info *info) mod->state = MODULE_STATE_COMING; mutex_unlock(&module_mutex); + ftrace_module_enable(mod); blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); return 0; @@ -3496,7 +3518,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Module is ready to execute: parsing args may do that. */ after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, - -32768, 32767, NULL, + -32768, 32767, mod, unknown_module_param_cb); if (IS_ERR(after_dashes)) { err = PTR_ERR(after_dashes); @@ -3627,6 +3649,11 @@ static inline int is_arm_mapping_symbol(const char *str) && (str[2] == '\0' || str[2] == '.'); } +static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum) +{ + return kallsyms->strtab + kallsyms->symtab[symnum].st_name; +} + static const char *get_ksymbol(struct module *mod, unsigned long addr, unsigned long *size, @@ -3634,6 +3661,7 @@ static const char *get_ksymbol(struct module *mod, { unsigned int i, best = 0; unsigned long nextval; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); /* At worse, next value is at end of module */ if (within_module_init(addr, mod)) @@ -3643,32 +3671,32 @@ static const char *get_ksymbol(struct module *mod, /* Scan for closest preceding symbol, and next symbol. (ELF starts real symbols at 1). */ - for (i = 1; i < mod->num_symtab; i++) { - if (mod->symtab[i].st_shndx == SHN_UNDEF) + for (i = 1; i < kallsyms->num_symtab; i++) { + if (kallsyms->symtab[i].st_shndx == SHN_UNDEF) continue; /* We ignore unnamed symbols: they're uninformative * and inserted at a whim. */ - if (mod->symtab[i].st_value <= addr - && mod->symtab[i].st_value > mod->symtab[best].st_value - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) + if (*symname(kallsyms, i) == '\0' + || is_arm_mapping_symbol(symname(kallsyms, i))) + continue; + + if (kallsyms->symtab[i].st_value <= addr + && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value) best = i; - if (mod->symtab[i].st_value > addr - && mod->symtab[i].st_value < nextval - && *(mod->strtab + mod->symtab[i].st_name) != '\0' - && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name)) - nextval = mod->symtab[i].st_value; + if (kallsyms->symtab[i].st_value > addr + && kallsyms->symtab[i].st_value < nextval) + nextval = kallsyms->symtab[i].st_value; } if (!best) return NULL; if (size) - *size = nextval - mod->symtab[best].st_value; + *size = nextval - kallsyms->symtab[best].st_value; if (offset) - *offset = addr - mod->symtab[best].st_value; - return mod->strtab + mod->symtab[best].st_name; + *offset = addr - kallsyms->symtab[best].st_value; + return symname(kallsyms, best); } /* For kallsyms to ask for address resolution. NULL means not found. Careful @@ -3758,19 +3786,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + struct mod_kallsyms *kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - if (symnum < mod->num_symtab) { - *value = mod->symtab[symnum].st_value; - *type = mod->symtab[symnum].st_info; - strlcpy(name, mod->strtab + mod->symtab[symnum].st_name, - KSYM_NAME_LEN); + kallsyms = rcu_dereference_sched(mod->kallsyms); + if (symnum < kallsyms->num_symtab) { + *value = kallsyms->symtab[symnum].st_value; + *type = kallsyms->symtab[symnum].st_info; + strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN); strlcpy(module_name, mod->name, MODULE_NAME_LEN); *exported = is_exported(name, *value, mod); preempt_enable(); return 0; } - symnum -= mod->num_symtab; + symnum -= kallsyms->num_symtab; } preempt_enable(); return -ERANGE; @@ -3779,11 +3809,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, static unsigned long mod_find_symname(struct module *mod, const char *name) { unsigned int i; + struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms); - for (i = 0; i < mod->num_symtab; i++) - if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 && - mod->symtab[i].st_info != 'U') - return mod->symtab[i].st_value; + for (i = 0; i < kallsyms->num_symtab; i++) + if (strcmp(name, symname(kallsyms, i)) == 0 && + kallsyms->symtab[i].st_info != 'U') + return kallsyms->symtab[i].st_value; return 0; } @@ -3822,11 +3853,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, module_assert_mutex(); list_for_each_entry(mod, &modules, list) { + /* We hold module_mutex: no need for rcu_dereference_sched */ + struct mod_kallsyms *kallsyms = mod->kallsyms; + if (mod->state == MODULE_STATE_UNFORMED) continue; - for (i = 0; i < mod->num_symtab; i++) { - ret = fn(data, mod->strtab + mod->symtab[i].st_name, - mod, mod->symtab[i].st_value); + for (i = 0; i < kallsyms->num_symtab; i++) { + ret = fn(data, symname(kallsyms, i), + mod, kallsyms->symtab[i].st_value); if (ret != 0) return ret; } diff --git a/kernel/pid.c b/kernel/pid.c index f4ad91b746f1..4d73a834c7e6 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -588,7 +588,7 @@ void __init pidhash_init(void) void __init pidmap_init(void) { - /* Veryify no one has done anything silly */ + /* Verify no one has done anything silly: */ BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING); /* bump default and minimum pid_max based on number of cpus */ diff --git a/kernel/resource.c b/kernel/resource.c index 49834309043c..4d466052426b 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1098,9 +1098,10 @@ struct resource * __request_region(struct resource *parent, if (!conflict) break; if (conflict != parent) { - parent = conflict; - if (!(conflict->flags & IORESOURCE_BUSY)) + if (!(conflict->flags & IORESOURCE_BUSY)) { + parent = conflict; continue; + } } if (conflict->flags & flags & IORESOURCE_MUXED) { add_wait_queue(&muxed_resource_wait, &wait); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 63d3a24e081a..9503d590e5ef 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6840,7 +6840,7 @@ static void sched_init_numa(void) sched_domains_numa_masks[i][j] = mask; - for (k = 0; k < nr_node_ids; k++) { + for_each_node(k) { if (node_distance(j, k) > sched_domains_numa_distance[i]) continue; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index cd64c979d0e1..57b939c81bce 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -420,7 +420,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se, * entity. */ if (dl_time_before(dl_se->deadline, rq_clock(rq))) { - printk_deferred_once("sched: DL replenish lagged to much\n"); + printk_deferred_once("sched: DL replenish lagged too much\n"); dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline; dl_se->runtime = pi_se->dl_runtime; } diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 1926606ece80..56b7d4b83947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1220,8 +1220,6 @@ static void task_numa_assign(struct task_numa_env *env, { if (env->best_task) put_task_struct(env->best_task); - if (p) - get_task_struct(p); env->best_task = p; env->best_imp = imp; @@ -1289,20 +1287,30 @@ static void task_numa_compare(struct task_numa_env *env, long imp = env->p->numa_group ? groupimp : taskimp; long moveimp = imp; int dist = env->dist; + bool assigned = false; rcu_read_lock(); raw_spin_lock_irq(&dst_rq->lock); cur = dst_rq->curr; /* - * No need to move the exiting task, and this ensures that ->curr - * wasn't reaped and thus get_task_struct() in task_numa_assign() - * is safe under RCU read lock. - * Note that rcu_read_lock() itself can't protect from the final - * put_task_struct() after the last schedule(). + * No need to move the exiting task or idle task. */ if ((cur->flags & PF_EXITING) || is_idle_task(cur)) cur = NULL; + else { + /* + * The task_struct must be protected here to protect the + * p->numa_faults access in the task_weight since the + * numa_faults could already be freed in the following path: + * finish_task_switch() + * --> put_task_struct() + * --> __put_task_struct() + * --> task_numa_free() + */ + get_task_struct(cur); + } + raw_spin_unlock_irq(&dst_rq->lock); /* @@ -1386,6 +1394,7 @@ balance: */ if (!load_too_imbalanced(src_load, dst_load, env)) { imp = moveimp - 1; + put_task_struct(cur); cur = NULL; goto assign; } @@ -1411,9 +1420,16 @@ balance: env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu); assign: + assigned = true; task_numa_assign(env, cur, imp); unlock: rcu_read_unlock(); + /* + * The dst_rq->curr isn't assigned. The protection for task_struct is + * finished. + */ + if (cur && !assigned) + put_task_struct(cur); } static void task_numa_find_cpu(struct task_numa_env *env, diff --git a/kernel/signal.c b/kernel/signal.c index f3f1f7a972fd..0508544c8ced 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set) current->saved_sigmask = current->blocked; set_current_blocked(set); - __set_current_state(TASK_INTERRUPTIBLE); - schedule(); + while (!signal_pending(current)) { + __set_current_state(TASK_INTERRUPTIBLE); + schedule(); + } set_restore_sigmask(); return -ERESTARTNOHAND; } diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 435b8850dd80..fa909f9fd559 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -897,10 +897,10 @@ static int enqueue_hrtimer(struct hrtimer *timer, */ static void __remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, - unsigned long newstate, int reprogram) + u8 newstate, int reprogram) { struct hrtimer_cpu_base *cpu_base = base->cpu_base; - unsigned int state = timer->state; + u8 state = timer->state; timer->state = newstate; if (!(state & HRTIMER_STATE_ENQUEUED)) @@ -930,7 +930,7 @@ static inline int remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart) { if (hrtimer_is_queued(timer)) { - unsigned long state = timer->state; + u8 state = timer->state; int reprogram; /* @@ -954,6 +954,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest return 0; } +static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim, + const enum hrtimer_mode mode) +{ +#ifdef CONFIG_TIME_LOW_RES + /* + * CONFIG_TIME_LOW_RES indicates that the system has no way to return + * granular time values. For relative timers we add hrtimer_resolution + * (i.e. one jiffie) to prevent short timeouts. + */ + timer->is_rel = mode & HRTIMER_MODE_REL; + if (timer->is_rel) + tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); +#endif + return tim; +} + /** * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU * @timer: the timer to be added @@ -974,19 +990,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, /* Remove an active timer from the queue: */ remove_hrtimer(timer, base, true); - if (mode & HRTIMER_MODE_REL) { + if (mode & HRTIMER_MODE_REL) tim = ktime_add_safe(tim, base->get_time()); - /* - * CONFIG_TIME_LOW_RES is a temporary way for architectures - * to signal that they simply return xtime in - * do_gettimeoffset(). In this case we want to round up by - * resolution when starting a relative timer, to avoid short - * timeouts. This will go away with the GTOD framework. - */ -#ifdef CONFIG_TIME_LOW_RES - tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution)); -#endif - } + + tim = hrtimer_update_lowres(timer, tim, mode); hrtimer_set_expires_range_ns(timer, tim, delta_ns); @@ -1074,19 +1081,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel); /** * hrtimer_get_remaining - get remaining time for the timer * @timer: the timer to read + * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y */ -ktime_t hrtimer_get_remaining(const struct hrtimer *timer) +ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust) { unsigned long flags; ktime_t rem; lock_hrtimer_base(timer, &flags); - rem = hrtimer_expires_remaining(timer); + if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust) + rem = hrtimer_expires_remaining_adjusted(timer); + else + rem = hrtimer_expires_remaining(timer); unlock_hrtimer_base(timer, &flags); return rem; } -EXPORT_SYMBOL_GPL(hrtimer_get_remaining); +EXPORT_SYMBOL_GPL(__hrtimer_get_remaining); #ifdef CONFIG_NO_HZ_COMMON /** @@ -1220,6 +1231,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base, fn = timer->function; /* + * Clear the 'is relative' flag for the TIME_LOW_RES case. If the + * timer is restarted with a period then it becomes an absolute + * timer. If its not restarted it does not matter. + */ + if (IS_ENABLED(CONFIG_TIME_LOW_RES)) + timer->is_rel = false; + + /* * Because we run timers from hardirq context, there is no chance * they get migrated to another cpu, therefore its safe to unlock * the timer base. diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c index 8d262b467573..1d5c7204ddc9 100644 --- a/kernel/time/itimer.c +++ b/kernel/time/itimer.c @@ -26,7 +26,7 @@ */ static struct timeval itimer_get_remtime(struct hrtimer *timer) { - ktime_t rem = hrtimer_get_remaining(timer); + ktime_t rem = __hrtimer_get_remaining(timer, true); /* * Racy but safe: if the itimer expires after the above diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 36f2ca09aa5e..6df8927c58a5 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc) if (!capable(CAP_SYS_TIME)) return -EPERM; - if (!timeval_inject_offset_valid(&txc->time)) - return -EINVAL; + if (txc->modes & ADJ_NANO) { + struct timespec ts; + + ts.tv_sec = txc->time.tv_sec; + ts.tv_nsec = txc->time.tv_usec; + if (!timespec_inject_offset_valid(&ts)) + return -EINVAL; + + } else { + if (!timeval_inject_offset_valid(&txc->time)) + return -EINVAL; + } } /* diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 31d11ac9fa47..f2826c35e918 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting) (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE)) timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv); - remaining = ktime_sub(hrtimer_get_expires(timer), now); + remaining = __hrtimer_expires_remaining_adjusted(timer, now); /* Return 0 only, when the timer is expired and not pending */ if (remaining.tv64 <= 0) { /* diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 9d7a053545f5..0b17424349eb 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -36,16 +36,17 @@ */ static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched); -/* - * The time, when the last jiffy update happened. Protected by jiffies_lock. - */ -static ktime_t last_jiffies_update; - struct tick_sched *tick_get_tick_sched(int cpu) { return &per_cpu(tick_cpu_sched, cpu); } +#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS) +/* + * The time, when the last jiffy update happened. Protected by jiffies_lock. + */ +static ktime_t last_jiffies_update; + /* * Must be called with interrupts disabled ! */ @@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs) update_process_times(user_mode(regs)); profile_tick(CPU_PROFILING); } +#endif #ifdef CONFIG_NO_HZ_FULL cpumask_var_t tick_nohz_full_mask; @@ -993,9 +995,9 @@ static void tick_nohz_switch_to_nohz(void) /* Get the next period */ next = tick_init_jiffy_update(); - hrtimer_forward_now(&ts->sched_timer, tick_period); hrtimer_set_expires(&ts->sched_timer, next); - tick_program_event(next, 1); + hrtimer_forward_now(&ts->sched_timer, tick_period); + tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1); tick_nohz_activate(ts, NOHZ_MODE_LOWRES); } diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index f75e35b60149..ba7d8b288bb3 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, print_name_offset(m, taddr); SEQ_printf(m, ", "); print_name_offset(m, timer->function); - SEQ_printf(m, ", S:%02lx", timer->state); + SEQ_printf(m, ", S:%02x", timer->state); #ifdef CONFIG_TIMER_STATS SEQ_printf(m, ", "); print_name_offset(m, timer->start_site); diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c index 45dd798bcd37..326a75e884db 100644 --- a/kernel/trace/bpf_trace.c +++ b/kernel/trace/bpf_trace.c @@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5) struct bpf_map *map = (struct bpf_map *) (unsigned long) r1; struct bpf_array *array = container_of(map, struct bpf_array, map); struct perf_event *event; + struct file *file; if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (!event) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + /* make sure event is local and doesn't have pmu::count */ if (event->oncpu != smp_processor_id() || event->pmu->count) @@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) void *data = (void *) (long) r4; struct perf_sample_data sample_data; struct perf_event *event; + struct file *file; struct perf_raw_record raw = { .size = size, .data = data, @@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size) if (unlikely(index >= array->map.max_entries)) return -E2BIG; - event = (struct perf_event *)array->ptrs[index]; - if (unlikely(!event)) + file = (struct file *)array->ptrs[index]; + if (unlikely(!file)) return -ENOENT; + event = file->private_data; + if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE || event->attr.config != PERF_COUNT_SW_BPF_OUTPUT)) return -EINVAL; diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eca592f977b2..57a6eea84694 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod) mutex_unlock(&ftrace_lock); } -static void ftrace_module_enable(struct module *mod) +void ftrace_module_enable(struct module *mod) { struct dyn_ftrace *rec; struct ftrace_page *pg; @@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod) ftrace_process_locs(mod, mod->ftrace_callsites, mod->ftrace_callsites + mod->num_ftrace_callsites); } - -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - struct module *mod = data; - - switch (val) { - case MODULE_STATE_COMING: - ftrace_module_enable(mod); - break; - case MODULE_STATE_GOING: - ftrace_release_mod(mod); - break; - default: - break; - } - - return 0; -} -#else -static int ftrace_module_notify(struct notifier_block *self, - unsigned long val, void *data) -{ - return 0; -} #endif /* CONFIG_MODULES */ -struct notifier_block ftrace_module_nb = { - .notifier_call = ftrace_module_notify, - .priority = INT_MIN, /* Run after anything that can remove kprobes */ -}; - void __init ftrace_init(void) { extern unsigned long __start_mcount_loc[]; @@ -5098,10 +5068,6 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); - ret = register_module_notifier(&ftrace_module_nb); - if (ret) - pr_warning("Failed to register trace ftrace module exit notifier\n"); - set_ftrace_early_filters(); return; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f333e57c4614..ab09829d3b97 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -869,7 +869,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos) * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. */ - if (call->class && call->class->reg) + if (call->class && call->class->reg && + !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE)) return file; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index dda9e6742950..2a1abbaca10e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack) } /* + * Some archs may not have the passed in ip in the dump. + * If that happens, we need to show everything. + */ + if (i == stack_trace_max.nr_entries) + i = 0; + + /* * Now find where in the stack these are. */ x = 0; @@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack) for (; p < top && i < stack_trace_max.nr_entries; p++) { if (stack_dump_trace[i] == ULONG_MAX) break; - if (*p == stack_dump_trace[i]) { + /* + * The READ_ONCE_NOCHECK is used to let KASAN know that + * this is not a stack-out-of-bounds error. + */ + if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) { stack_dump_trace[x] = stack_dump_trace[i++]; this_size = stack_trace_index[x++] = (top - p) * sizeof(unsigned long); diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 61a0264e28f9..7ff5dc7d2ac5 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -301,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */ static LIST_HEAD(workqueues); /* PR: list of all workqueues */ static bool workqueue_freezing; /* PL: have wqs started freezing? */ -static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */ +/* PL: allowable cpus for unbound wqs and work items */ +static cpumask_var_t wq_unbound_cpumask; + +/* CPU where unbound work was last round robin scheduled from this CPU */ +static DEFINE_PER_CPU(int, wq_rr_cpu_last); + +/* + * Local execution of unbound work items is no longer guaranteed. The + * following always forces round-robin CPU selection on unbound work items + * to uncover usages which depend on it. + */ +#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU +static bool wq_debug_force_rr_cpu = true; +#else +static bool wq_debug_force_rr_cpu = false; +#endif +module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644); /* the per-cpu worker pools */ static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS], @@ -570,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq, int node) { assert_rcu_or_wq_mutex_or_pool_mutex(wq); + + /* + * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a + * delayed item is pending. The plan is to keep CPU -> NODE + * mapping valid and stable across CPU on/offlines. Once that + * happens, this workaround can be removed. + */ + if (unlikely(node == NUMA_NO_NODE)) + return wq->dfl_pwq; + return rcu_dereference_raw(wq->numa_pwq_tbl[node]); } @@ -1298,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq) return worker && worker->current_pwq->wq == wq; } +/* + * When queueing an unbound work item to a wq, prefer local CPU if allowed + * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to + * avoid perturbing sensitive tasks. + */ +static int wq_select_unbound_cpu(int cpu) +{ + static bool printed_dbg_warning; + int new_cpu; + + if (likely(!wq_debug_force_rr_cpu)) { + if (cpumask_test_cpu(cpu, wq_unbound_cpumask)) + return cpu; + } else if (!printed_dbg_warning) { + pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n"); + printed_dbg_warning = true; + } + + if (cpumask_empty(wq_unbound_cpumask)) + return cpu; + + new_cpu = __this_cpu_read(wq_rr_cpu_last); + new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) { + new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask); + if (unlikely(new_cpu >= nr_cpu_ids)) + return cpu; + } + __this_cpu_write(wq_rr_cpu_last, new_cpu); + + return new_cpu; +} + static void __queue_work(int cpu, struct workqueue_struct *wq, struct work_struct *work) { @@ -1323,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq, return; retry: if (req_cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); + cpu = wq_select_unbound_cpu(raw_smp_processor_id()); /* pwq which will be used unless @work is executing elsewhere */ if (!(wq->flags & WQ_UNBOUND)) @@ -1464,13 +1523,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq, timer_stats_timer_set_start_info(&dwork->timer); dwork->wq = wq; - /* timer isn't guaranteed to run in this cpu, record earlier */ - if (cpu == WORK_CPU_UNBOUND) - cpu = raw_smp_processor_id(); dwork->cpu = cpu; timer->expires = jiffies + delay; - add_timer_on(timer, cpu); + if (unlikely(cpu != WORK_CPU_UNBOUND)) + add_timer_on(timer, cpu); + else + add_timer(timer); } /** @@ -2355,7 +2414,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq, WARN_ONCE(current->flags & PF_MEMALLOC, "workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf", current->pid, current->comm, target_wq->name, target_func); - WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM), + WARN_ONCE(worker && ((worker->current_pwq->wq->flags & + (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM), "workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf", worker->current_pwq->wq->name, worker->current_func, target_wq->name, target_func); |