summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/arraymap.c21
-rw-r--r--kernel/bpf/verifier.c2
-rw-r--r--kernel/cgroup.c31
-rw-r--r--kernel/cpuset.c71
-rw-r--r--kernel/events/core.c1387
-rw-r--r--kernel/events/hw_breakpoint.c2
-rw-r--r--kernel/events/ring_buffer.c40
-rw-r--r--kernel/futex.c18
-rw-r--r--kernel/irq/handle.c5
-rw-r--r--kernel/irq/irqdomain.c11
-rw-r--r--kernel/locking/lockdep.c58
-rw-r--r--kernel/locking/rtmutex.c135
-rw-r--r--kernel/memremap.c26
-rw-r--r--kernel/module.c124
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/resource.c5
-rw-r--r--kernel/sched/core.c2
-rw-r--r--kernel/sched/deadline.c2
-rw-r--r--kernel/sched/fair.c30
-rw-r--r--kernel/signal.c6
-rw-r--r--kernel/time/hrtimer.c55
-rw-r--r--kernel/time/itimer.c2
-rw-r--r--kernel/time/ntp.c14
-rw-r--r--kernel/time/posix-timers.c2
-rw-r--r--kernel/time/tick-sched.c16
-rw-r--r--kernel/time/timer_list.c2
-rw-r--r--kernel/trace/bpf_trace.c14
-rw-r--r--kernel/trace/ftrace.c36
-rw-r--r--kernel/trace/trace_events.c3
-rw-r--r--kernel/trace/trace_stack.c13
-rw-r--r--kernel/workqueue.c74
31 files changed, 1255 insertions, 954 deletions
diff --git a/kernel/bpf/arraymap.c b/kernel/bpf/arraymap.c
index b0799bced518..89ebbc4d1164 100644
--- a/kernel/bpf/arraymap.c
+++ b/kernel/bpf/arraymap.c
@@ -291,10 +291,13 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
{
struct perf_event *event;
const struct perf_event_attr *attr;
+ struct file *file;
- event = perf_event_get(fd);
- if (IS_ERR(event))
- return event;
+ file = perf_event_get(fd);
+ if (IS_ERR(file))
+ return file;
+
+ event = file->private_data;
attr = perf_event_attrs(event);
if (IS_ERR(attr))
@@ -304,24 +307,22 @@ static void *perf_event_fd_array_get_ptr(struct bpf_map *map, int fd)
goto err;
if (attr->type == PERF_TYPE_RAW)
- return event;
+ return file;
if (attr->type == PERF_TYPE_HARDWARE)
- return event;
+ return file;
if (attr->type == PERF_TYPE_SOFTWARE &&
attr->config == PERF_COUNT_SW_BPF_OUTPUT)
- return event;
+ return file;
err:
- perf_event_release_kernel(event);
+ fput(file);
return ERR_PTR(-EINVAL);
}
static void perf_event_fd_array_put_ptr(void *ptr)
{
- struct perf_event *event = ptr;
-
- perf_event_release_kernel(event);
+ fput((struct file *)ptr);
}
static const struct bpf_map_ops perf_event_array_ops = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index d1d3e8f57de9..2e7f7ab739e4 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -2082,7 +2082,7 @@ static void adjust_branches(struct bpf_prog *prog, int pos, int delta)
/* adjust offset of jmps if necessary */
if (i < pos && i + insn->off + 1 > pos)
insn->off += delta;
- else if (i > pos && i + insn->off + 1 < pos)
+ else if (i > pos + delta && i + insn->off + 1 <= pos + delta)
insn->off -= delta;
}
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c03a640ef6da..d27904c193da 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -58,6 +58,7 @@
#include <linux/kthread.h>
#include <linux/delay.h>
#include <linux/atomic.h>
+#include <linux/cpuset.h>
#include <net/sock.h>
/*
@@ -2739,6 +2740,7 @@ out_unlock_rcu:
out_unlock_threadgroup:
percpu_up_write(&cgroup_threadgroup_rwsem);
cgroup_kn_unlock(of->kn);
+ cpuset_post_attach_flush();
return ret ?: nbytes;
}
@@ -4655,14 +4657,15 @@ static void css_free_work_fn(struct work_struct *work)
if (ss) {
/* css free path */
+ struct cgroup_subsys_state *parent = css->parent;
int id = css->id;
- if (css->parent)
- css_put(css->parent);
-
ss->css_free(css);
cgroup_idr_remove(&ss->css_idr, id);
cgroup_put(cgrp);
+
+ if (parent)
+ css_put(parent);
} else {
/* cgroup free path */
atomic_dec(&cgrp->root->nr_cgrps);
@@ -4758,6 +4761,7 @@ static void init_and_link_css(struct cgroup_subsys_state *css,
INIT_LIST_HEAD(&css->sibling);
INIT_LIST_HEAD(&css->children);
css->serial_nr = css_serial_nr_next++;
+ atomic_set(&css->online_cnt, 0);
if (cgroup_parent(cgrp)) {
css->parent = cgroup_css(cgroup_parent(cgrp), ss);
@@ -4780,6 +4784,10 @@ static int online_css(struct cgroup_subsys_state *css)
if (!ret) {
css->flags |= CSS_ONLINE;
rcu_assign_pointer(css->cgroup->subsys[ss->id], css);
+
+ atomic_inc(&css->online_cnt);
+ if (css->parent)
+ atomic_inc(&css->parent->online_cnt);
}
return ret;
}
@@ -5017,10 +5025,15 @@ static void css_killed_work_fn(struct work_struct *work)
container_of(work, struct cgroup_subsys_state, destroy_work);
mutex_lock(&cgroup_mutex);
- offline_css(css);
- mutex_unlock(&cgroup_mutex);
- css_put(css);
+ do {
+ offline_css(css);
+ css_put(css);
+ /* @css can't go away while we're holding cgroup_mutex */
+ css = css->parent;
+ } while (css && atomic_dec_and_test(&css->online_cnt));
+
+ mutex_unlock(&cgroup_mutex);
}
/* css kill confirmation processing requires process context, bounce */
@@ -5029,8 +5042,10 @@ static void css_killed_ref_fn(struct percpu_ref *ref)
struct cgroup_subsys_state *css =
container_of(ref, struct cgroup_subsys_state, refcnt);
- INIT_WORK(&css->destroy_work, css_killed_work_fn);
- queue_work(cgroup_destroy_wq, &css->destroy_work);
+ if (atomic_dec_and_test(&css->online_cnt)) {
+ INIT_WORK(&css->destroy_work, css_killed_work_fn);
+ queue_work(cgroup_destroy_wq, &css->destroy_work);
+ }
}
/**
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3e945fcd8179..41989ab4db57 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -287,6 +287,8 @@ static struct cpuset top_cpuset = {
static DEFINE_MUTEX(cpuset_mutex);
static DEFINE_SPINLOCK(callback_lock);
+static struct workqueue_struct *cpuset_migrate_mm_wq;
+
/*
* CPU / memory hotplug is handled asynchronously.
*/
@@ -972,31 +974,51 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
}
/*
- * cpuset_migrate_mm
- *
- * Migrate memory region from one set of nodes to another.
- *
- * Temporarilly set tasks mems_allowed to target nodes of migration,
- * so that the migration code can allocate pages on these nodes.
- *
- * While the mm_struct we are migrating is typically from some
- * other task, the task_struct mems_allowed that we are hacking
- * is for our current task, which must allocate new pages for that
- * migrating memory region.
+ * Migrate memory region from one set of nodes to another. This is
+ * performed asynchronously as it can be called from process migration path
+ * holding locks involved in process management. All mm migrations are
+ * performed in the queued order and can be waited for by flushing
+ * cpuset_migrate_mm_wq.
*/
+struct cpuset_migrate_mm_work {
+ struct work_struct work;
+ struct mm_struct *mm;
+ nodemask_t from;
+ nodemask_t to;
+};
+
+static void cpuset_migrate_mm_workfn(struct work_struct *work)
+{
+ struct cpuset_migrate_mm_work *mwork =
+ container_of(work, struct cpuset_migrate_mm_work, work);
+
+ /* on a wq worker, no need to worry about %current's mems_allowed */
+ do_migrate_pages(mwork->mm, &mwork->from, &mwork->to, MPOL_MF_MOVE_ALL);
+ mmput(mwork->mm);
+ kfree(mwork);
+}
+
static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
const nodemask_t *to)
{
- struct task_struct *tsk = current;
-
- tsk->mems_allowed = *to;
+ struct cpuset_migrate_mm_work *mwork;
- do_migrate_pages(mm, from, to, MPOL_MF_MOVE_ALL);
+ mwork = kzalloc(sizeof(*mwork), GFP_KERNEL);
+ if (mwork) {
+ mwork->mm = mm;
+ mwork->from = *from;
+ mwork->to = *to;
+ INIT_WORK(&mwork->work, cpuset_migrate_mm_workfn);
+ queue_work(cpuset_migrate_mm_wq, &mwork->work);
+ } else {
+ mmput(mm);
+ }
+}
- rcu_read_lock();
- guarantee_online_mems(task_cs(tsk), &tsk->mems_allowed);
- rcu_read_unlock();
+void cpuset_post_attach_flush(void)
+{
+ flush_workqueue(cpuset_migrate_mm_wq);
}
/*
@@ -1097,7 +1119,8 @@ static void update_tasks_nodemask(struct cpuset *cs)
mpol_rebind_mm(mm, &cs->mems_allowed);
if (migrate)
cpuset_migrate_mm(mm, &cs->old_mems_allowed, &newmems);
- mmput(mm);
+ else
+ mmput(mm);
}
css_task_iter_end(&it);
@@ -1545,11 +1568,11 @@ static void cpuset_attach(struct cgroup_taskset *tset)
* @old_mems_allowed is the right nodesets that we
* migrate mm from.
*/
- if (is_memory_migrate(cs)) {
+ if (is_memory_migrate(cs))
cpuset_migrate_mm(mm, &oldcs->old_mems_allowed,
&cpuset_attach_nodemask_to);
- }
- mmput(mm);
+ else
+ mmput(mm);
}
}
@@ -1714,6 +1737,7 @@ out_unlock:
mutex_unlock(&cpuset_mutex);
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
+ flush_workqueue(cpuset_migrate_mm_wq);
return retval ?: nbytes;
}
@@ -2359,6 +2383,9 @@ void __init cpuset_init_smp(void)
top_cpuset.effective_mems = node_states[N_MEMORY];
register_hotmemory_notifier(&cpuset_track_online_nodes_nb);
+
+ cpuset_migrate_mm_wq = alloc_ordered_workqueue("cpuset_migrate_mm", 0);
+ BUG_ON(!cpuset_migrate_mm_wq);
}
/**
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 06ae52e99ac2..614614821f00 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -49,8 +49,6 @@
#include <asm/irq_regs.h>
-static struct workqueue_struct *perf_wq;
-
typedef int (*remote_function_f)(void *);
struct remote_function_call {
@@ -66,8 +64,17 @@ static void remote_function(void *data)
struct task_struct *p = tfc->p;
if (p) {
- tfc->ret = -EAGAIN;
- if (task_cpu(p) != smp_processor_id() || !task_curr(p))
+ /* -EAGAIN */
+ if (task_cpu(p) != smp_processor_id())
+ return;
+
+ /*
+ * Now that we're on right CPU with IRQs disabled, we can test
+ * if we hit the right task without races.
+ */
+
+ tfc->ret = -ESRCH; /* No such (running) process */
+ if (p != current)
return;
}
@@ -94,13 +101,17 @@ task_function_call(struct task_struct *p, remote_function_f func, void *info)
.p = p,
.func = func,
.info = info,
- .ret = -ESRCH, /* No such (running) process */
+ .ret = -EAGAIN,
};
+ int ret;
- if (task_curr(p))
- smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ do {
+ ret = smp_call_function_single(task_cpu(p), remote_function, &data, 1);
+ if (!ret)
+ ret = data.ret;
+ } while (ret == -EAGAIN);
- return data.ret;
+ return ret;
}
/**
@@ -126,44 +137,170 @@ static int cpu_function_call(int cpu, remote_function_f func, void *info)
return data.ret;
}
-static void event_function_call(struct perf_event *event,
- int (*active)(void *),
- void (*inactive)(void *),
- void *data)
+static inline struct perf_cpu_context *
+__get_cpu_context(struct perf_event_context *ctx)
+{
+ return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
+}
+
+static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx)
+ raw_spin_lock(&ctx->lock);
+}
+
+static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (ctx)
+ raw_spin_unlock(&ctx->lock);
+ raw_spin_unlock(&cpuctx->ctx.lock);
+}
+
+#define TASK_TOMBSTONE ((void *)-1L)
+
+static bool is_kernel_event(struct perf_event *event)
{
+ return READ_ONCE(event->owner) == TASK_TOMBSTONE;
+}
+
+/*
+ * On task ctx scheduling...
+ *
+ * When !ctx->nr_events a task context will not be scheduled. This means
+ * we can disable the scheduler hooks (for performance) without leaving
+ * pending task ctx state.
+ *
+ * This however results in two special cases:
+ *
+ * - removing the last event from a task ctx; this is relatively straight
+ * forward and is done in __perf_remove_from_context.
+ *
+ * - adding the first event to a task ctx; this is tricky because we cannot
+ * rely on ctx->is_active and therefore cannot use event_function_call().
+ * See perf_install_in_context().
+ *
+ * If ctx->nr_events, then ctx->is_active and cpuctx->task_ctx are set.
+ */
+
+typedef void (*event_f)(struct perf_event *, struct perf_cpu_context *,
+ struct perf_event_context *, void *);
+
+struct event_function_struct {
+ struct perf_event *event;
+ event_f func;
+ void *data;
+};
+
+static int event_function(void *info)
+{
+ struct event_function_struct *efs = info;
+ struct perf_event *event = efs->event;
struct perf_event_context *ctx = event->ctx;
- struct task_struct *task = ctx->task;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event_context *task_ctx = cpuctx->task_ctx;
+ int ret = 0;
+
+ WARN_ON_ONCE(!irqs_disabled());
+
+ perf_ctx_lock(cpuctx, task_ctx);
+ /*
+ * Since we do the IPI call without holding ctx->lock things can have
+ * changed, double check we hit the task we set out to hit.
+ */
+ if (ctx->task) {
+ if (ctx->task != current) {
+ ret = -ESRCH;
+ goto unlock;
+ }
+
+ /*
+ * We only use event_function_call() on established contexts,
+ * and event_function() is only ever called when active (or
+ * rather, we'll have bailed in task_function_call() or the
+ * above ctx->task != current test), therefore we must have
+ * ctx->is_active here.
+ */
+ WARN_ON_ONCE(!ctx->is_active);
+ /*
+ * And since we have ctx->is_active, cpuctx->task_ctx must
+ * match.
+ */
+ WARN_ON_ONCE(task_ctx != ctx);
+ } else {
+ WARN_ON_ONCE(&cpuctx->ctx != ctx);
+ }
+
+ efs->func(event, cpuctx, ctx, efs->data);
+unlock:
+ perf_ctx_unlock(cpuctx, task_ctx);
+
+ return ret;
+}
+
+static void event_function_local(struct perf_event *event, event_f func, void *data)
+{
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ int ret = event_function(&efs);
+ WARN_ON_ONCE(ret);
+}
+
+static void event_function_call(struct perf_event *event, event_f func, void *data)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct task_struct *task = READ_ONCE(ctx->task); /* verified in event_function */
+ struct event_function_struct efs = {
+ .event = event,
+ .func = func,
+ .data = data,
+ };
+
+ if (!event->parent) {
+ /*
+ * If this is a !child event, we must hold ctx::mutex to
+ * stabilize the the event->ctx relation. See
+ * perf_event_ctx_lock().
+ */
+ lockdep_assert_held(&ctx->mutex);
+ }
if (!task) {
- cpu_function_call(event->cpu, active, data);
+ cpu_function_call(event->cpu, event_function, &efs);
return;
}
+ if (task == TASK_TOMBSTONE)
+ return;
+
again:
- if (!task_function_call(task, active, data))
+ if (!task_function_call(task, event_function, &efs))
return;
raw_spin_lock_irq(&ctx->lock);
+ /*
+ * Reload the task pointer, it might have been changed by
+ * a concurrent perf_event_context_sched_out().
+ */
+ task = ctx->task;
+ if (task == TASK_TOMBSTONE) {
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
if (ctx->is_active) {
- /*
- * Reload the task pointer, it might have been changed by
- * a concurrent perf_event_context_sched_out().
- */
- task = ctx->task;
raw_spin_unlock_irq(&ctx->lock);
goto again;
}
- inactive(data);
+ func(event, NULL, ctx, data);
raw_spin_unlock_irq(&ctx->lock);
}
-#define EVENT_OWNER_KERNEL ((void *) -1)
-
-static bool is_kernel_event(struct perf_event *event)
-{
- return event->owner == EVENT_OWNER_KERNEL;
-}
-
#define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
PERF_FLAG_FD_OUTPUT |\
PERF_FLAG_PID_CGROUP |\
@@ -179,6 +316,7 @@ static bool is_kernel_event(struct perf_event *event)
enum event_type_t {
EVENT_FLEXIBLE = 0x1,
EVENT_PINNED = 0x2,
+ EVENT_TIME = 0x4,
EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
};
@@ -186,7 +324,13 @@ enum event_type_t {
* perf_sched_events : >0 events exist
* perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
*/
-struct static_key_deferred perf_sched_events __read_mostly;
+
+static void perf_sched_delayed(struct work_struct *work);
+DEFINE_STATIC_KEY_FALSE(perf_sched_events);
+static DECLARE_DELAYED_WORK(perf_sched_work, perf_sched_delayed);
+static DEFINE_MUTEX(perf_sched_mutex);
+static atomic_t perf_sched_count;
+
static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
static DEFINE_PER_CPU(int, perf_sched_cb_usages);
@@ -368,28 +512,6 @@ static inline u64 perf_event_clock(struct perf_event *event)
return event->clock();
}
-static inline struct perf_cpu_context *
-__get_cpu_context(struct perf_event_context *ctx)
-{
- return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
-}
-
-static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- raw_spin_lock(&cpuctx->ctx.lock);
- if (ctx)
- raw_spin_lock(&ctx->lock);
-}
-
-static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
- struct perf_event_context *ctx)
-{
- if (ctx)
- raw_spin_unlock(&ctx->lock);
- raw_spin_unlock(&cpuctx->ctx.lock);
-}
-
#ifdef CONFIG_CGROUP_PERF
static inline bool
@@ -579,13 +701,7 @@ static inline void perf_cgroup_sched_out(struct task_struct *task,
* we are holding the rcu lock
*/
cgrp1 = perf_cgroup_from_task(task, NULL);
-
- /*
- * next is NULL when called from perf_event_enable_on_exec()
- * that will systematically cause a cgroup_switch()
- */
- if (next)
- cgrp2 = perf_cgroup_from_task(next, NULL);
+ cgrp2 = perf_cgroup_from_task(next, NULL);
/*
* only schedule out current cgroup events if we know
@@ -611,8 +727,6 @@ static inline void perf_cgroup_sched_in(struct task_struct *prev,
* we are holding the rcu lock
*/
cgrp1 = perf_cgroup_from_task(task, NULL);
-
- /* prev can never be NULL */
cgrp2 = perf_cgroup_from_task(prev, NULL);
/*
@@ -917,7 +1031,7 @@ static void put_ctx(struct perf_event_context *ctx)
if (atomic_dec_and_test(&ctx->refcount)) {
if (ctx->parent_ctx)
put_ctx(ctx->parent_ctx);
- if (ctx->task)
+ if (ctx->task && ctx->task != TASK_TOMBSTONE)
put_task_struct(ctx->task);
call_rcu(&ctx->rcu_head, free_ctx);
}
@@ -934,9 +1048,8 @@ static void put_ctx(struct perf_event_context *ctx)
* perf_event_context::mutex nests and those are:
*
* - perf_event_exit_task_context() [ child , 0 ]
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event() [ parent, 1 ]
+ * perf_event_exit_event()
+ * put_event() [ parent, 1 ]
*
* - perf_event_init_context() [ parent, 0 ]
* inherit_task_group()
@@ -979,8 +1092,8 @@ static void put_ctx(struct perf_event_context *ctx)
* Lock order:
* task_struct::perf_event_mutex
* perf_event_context::mutex
- * perf_event_context::lock
* perf_event::child_mutex;
+ * perf_event_context::lock
* perf_event::mmap_mutex
* mmap_sem
*/
@@ -1078,6 +1191,7 @@ static u64 primary_event_id(struct perf_event *event)
/*
* Get the perf_event_context for a task and lock it.
+ *
* This has to cope with with the fact that until it is locked,
* the context could get moved to another task.
*/
@@ -1118,9 +1232,12 @@ retry:
goto retry;
}
- if (!atomic_inc_not_zero(&ctx->refcount)) {
+ if (ctx->task == TASK_TOMBSTONE ||
+ !atomic_inc_not_zero(&ctx->refcount)) {
raw_spin_unlock(&ctx->lock);
ctx = NULL;
+ } else {
+ WARN_ON_ONCE(ctx->task != task);
}
}
rcu_read_unlock();
@@ -1180,16 +1297,18 @@ static u64 perf_event_time(struct perf_event *event)
/*
* Update the total_time_enabled and total_time_running fields for a event.
- * The caller of this function needs to hold the ctx->lock.
*/
static void update_event_times(struct perf_event *event)
{
struct perf_event_context *ctx = event->ctx;
u64 run_end;
+ lockdep_assert_held(&ctx->lock);
+
if (event->state < PERF_EVENT_STATE_INACTIVE ||
event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
return;
+
/*
* in cgroup mode, time_enabled represents
* the time the event was enabled AND active
@@ -1246,6 +1365,8 @@ ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
static void
list_add_event(struct perf_event *event, struct perf_event_context *ctx)
{
+ lockdep_assert_held(&ctx->lock);
+
WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
event->attach_state |= PERF_ATTACH_CONTEXT;
@@ -1448,11 +1569,14 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
if (is_cgroup_event(event)) {
ctx->nr_cgroups--;
+ /*
+ * Because cgroup events are always per-cpu events, this will
+ * always be called from the right CPU.
+ */
cpuctx = __get_cpu_context(ctx);
/*
- * if there are no more cgroup events
- * then cler cgrp to avoid stale pointer
- * in update_cgrp_time_from_cpuctx()
+ * If there are no more cgroup events then clear cgrp to avoid
+ * stale pointer in update_cgrp_time_from_cpuctx().
*/
if (!ctx->nr_cgroups)
cpuctx->cgrp = NULL;
@@ -1530,45 +1654,11 @@ out:
perf_event__header_size(tmp);
}
-/*
- * User event without the task.
- */
static bool is_orphaned_event(struct perf_event *event)
{
- return event && !is_kernel_event(event) && !event->owner;
+ return event->state == PERF_EVENT_STATE_DEAD;
}
-/*
- * Event has a parent but parent's task finished and it's
- * alive only because of children holding refference.
- */
-static bool is_orphaned_child(struct perf_event *event)
-{
- return is_orphaned_event(event->parent);
-}
-
-static void orphans_remove_work(struct work_struct *work);
-
-static void schedule_orphans_remove(struct perf_event_context *ctx)
-{
- if (!ctx->task || ctx->orphans_remove_sched || !perf_wq)
- return;
-
- if (queue_delayed_work(perf_wq, &ctx->orphans_remove, 1)) {
- get_ctx(ctx);
- ctx->orphans_remove_sched = true;
- }
-}
-
-static int __init perf_workqueue_init(void)
-{
- perf_wq = create_singlethread_workqueue("perf");
- WARN(!perf_wq, "failed to create perf workqueue\n");
- return perf_wq ? 0 : -1;
-}
-
-core_initcall(perf_workqueue_init);
-
static inline int pmu_filter_match(struct perf_event *event)
{
struct pmu *pmu = event->pmu;
@@ -1611,14 +1701,14 @@ event_sched_out(struct perf_event *event,
perf_pmu_disable(event->pmu);
+ event->tstamp_stopped = tstamp;
+ event->pmu->del(event, 0);
+ event->oncpu = -1;
event->state = PERF_EVENT_STATE_INACTIVE;
if (event->pending_disable) {
event->pending_disable = 0;
event->state = PERF_EVENT_STATE_OFF;
}
- event->tstamp_stopped = tstamp;
- event->pmu->del(event, 0);
- event->oncpu = -1;
if (!is_software_event(event))
cpuctx->active_oncpu--;
@@ -1629,9 +1719,6 @@ event_sched_out(struct perf_event *event,
if (event->attr.exclusive || !cpuctx->active_oncpu)
cpuctx->exclusive = 0;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
perf_pmu_enable(event->pmu);
}
@@ -1655,21 +1742,7 @@ group_sched_out(struct perf_event *group_event,
cpuctx->exclusive = 0;
}
-struct remove_event {
- struct perf_event *event;
- bool detach_group;
-};
-
-static void ___perf_remove_from_context(void *info)
-{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
-
- if (re->detach_group)
- perf_group_detach(event);
- list_del_event(event, ctx);
-}
+#define DETACH_GROUP 0x01UL
/*
* Cross CPU call to remove a performance event
@@ -1677,33 +1750,31 @@ static void ___perf_remove_from_context(void *info)
* We disable the event on the hardware level first. After that we
* remove it from the context list.
*/
-static int __perf_remove_from_context(void *info)
+static void
+__perf_remove_from_context(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct remove_event *re = info;
- struct perf_event *event = re->event;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ unsigned long flags = (unsigned long)info;
- raw_spin_lock(&ctx->lock);
event_sched_out(event, cpuctx, ctx);
- if (re->detach_group)
+ if (flags & DETACH_GROUP)
perf_group_detach(event);
list_del_event(event, ctx);
- if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
+
+ if (!ctx->nr_events && ctx->is_active) {
ctx->is_active = 0;
- cpuctx->task_ctx = NULL;
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ cpuctx->task_ctx = NULL;
+ }
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}
/*
* Remove the event from a task's (or a CPU's) list of events.
*
- * CPU events are removed with a smp call. For task events we only
- * call when the task is on a CPU.
- *
* If event->ctx is a cloned context, callers must make sure that
* every task struct that event->ctx->task could possibly point to
* remains valid. This is OK when called from perf_release since
@@ -1711,73 +1782,32 @@ static int __perf_remove_from_context(void *info)
* When called from perf_event_exit_task, it's OK because the
* context has been detached from its task.
*/
-static void perf_remove_from_context(struct perf_event *event, bool detach_group)
+static void perf_remove_from_context(struct perf_event *event, unsigned long flags)
{
- struct perf_event_context *ctx = event->ctx;
- struct remove_event re = {
- .event = event,
- .detach_group = detach_group,
- };
-
- lockdep_assert_held(&ctx->mutex);
+ lockdep_assert_held(&event->ctx->mutex);
- event_function_call(event, __perf_remove_from_context,
- ___perf_remove_from_context, &re);
+ event_function_call(event, __perf_remove_from_context, (void *)flags);
}
/*
* Cross CPU call to disable a performance event
*/
-int __perf_event_disable(void *info)
-{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- /*
- * If this is a per-task event, need to check whether this
- * event's task is the current task on this cpu.
- *
- * Can trigger due to concurrent perf_event_context_sched_out()
- * flipping contexts around.
- */
- if (ctx->task && cpuctx->task_ctx != ctx)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
-
- /*
- * If the event is on, turn it off.
- * If it is in error state, leave it in error state.
- */
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
- update_context_time(ctx);
- update_cgrp_time_from_event(event);
- update_group_times(event);
- if (event == event->group_leader)
- group_sched_out(event, cpuctx, ctx);
- else
- event_sched_out(event, cpuctx, ctx);
- event->state = PERF_EVENT_STATE_OFF;
- }
-
- raw_spin_unlock(&ctx->lock);
-
- return 0;
-}
-
-void ___perf_event_disable(void *info)
+static void __perf_event_disable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
+ if (event->state < PERF_EVENT_STATE_INACTIVE)
+ return;
- /*
- * Since we have the lock this context can't be scheduled
- * in, so we can change the state safely.
- */
- if (event->state == PERF_EVENT_STATE_INACTIVE) {
- update_group_times(event);
- event->state = PERF_EVENT_STATE_OFF;
- }
+ update_context_time(ctx);
+ update_cgrp_time_from_event(event);
+ update_group_times(event);
+ if (event == event->group_leader)
+ group_sched_out(event, cpuctx, ctx);
+ else
+ event_sched_out(event, cpuctx, ctx);
+ event->state = PERF_EVENT_STATE_OFF;
}
/*
@@ -1788,7 +1818,8 @@ void ___perf_event_disable(void *info)
* remains valid. This condition is satisifed when called through
* perf_event_for_each_child or perf_event_for_each because they
* hold the top-level event's child_mutex, so any descendant that
- * goes to exit will block in sync_child_event.
+ * goes to exit will block in perf_event_exit_event().
+ *
* When called from perf_pending_event it's OK because event->ctx
* is the current context on this CPU and preemption is disabled,
* hence we can't get into perf_event_task_sched_out for this context.
@@ -1804,8 +1835,12 @@ static void _perf_event_disable(struct perf_event *event)
}
raw_spin_unlock_irq(&ctx->lock);
- event_function_call(event, __perf_event_disable,
- ___perf_event_disable, event);
+ event_function_call(event, __perf_event_disable, NULL);
+}
+
+void perf_event_disable_local(struct perf_event *event)
+{
+ event_function_local(event, __perf_event_disable, NULL);
}
/*
@@ -1918,9 +1953,6 @@ event_sched_in(struct perf_event *event,
if (event->attr.exclusive)
cpuctx->exclusive = 1;
- if (is_orphaned_child(event))
- schedule_orphans_remove(ctx);
-
out:
perf_pmu_enable(event->pmu);
@@ -2039,13 +2071,27 @@ static void add_event_to_ctx(struct perf_event *event,
event->tstamp_stopped = tstamp;
}
-static void task_ctx_sched_out(struct perf_event_context *ctx);
+static void ctx_sched_out(struct perf_event_context *ctx,
+ struct perf_cpu_context *cpuctx,
+ enum event_type_t event_type);
static void
ctx_sched_in(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type,
struct task_struct *task);
+static void task_ctx_sched_out(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx)
+{
+ if (!cpuctx->task_ctx)
+ return;
+
+ if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
+ return;
+
+ ctx_sched_out(ctx, cpuctx, EVENT_ALL);
+}
+
static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
struct perf_event_context *ctx,
struct task_struct *task)
@@ -2058,22 +2104,22 @@ static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
}
-static void ___perf_install_in_context(void *info)
+static void ctx_resched(struct perf_cpu_context *cpuctx,
+ struct perf_event_context *task_ctx)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
-
- /*
- * Since the task isn't running, its safe to add the event, us holding
- * the ctx->lock ensures the task won't get scheduled in.
- */
- add_event_to_ctx(event, ctx);
+ perf_pmu_disable(cpuctx->ctx.pmu);
+ if (task_ctx)
+ task_ctx_sched_out(cpuctx, task_ctx);
+ cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ perf_event_sched_in(cpuctx, task_ctx, current);
+ perf_pmu_enable(cpuctx->ctx.pmu);
}
/*
* Cross CPU call to install and enable a performance event
*
- * Must be called with ctx->mutex held
+ * Very similar to remote_function() + event_function() but cannot assume that
+ * things like ctx->is_active and cpuctx->task_ctx are set.
*/
static int __perf_install_in_context(void *info)
{
@@ -2081,79 +2127,106 @@ static int __perf_install_in_context(void *info)
struct perf_event_context *ctx = event->ctx;
struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
struct perf_event_context *task_ctx = cpuctx->task_ctx;
- struct task_struct *task = current;
-
- perf_ctx_lock(cpuctx, task_ctx);
- perf_pmu_disable(cpuctx->ctx.pmu);
-
- /*
- * If there was an active task_ctx schedule it out.
- */
- if (task_ctx)
- task_ctx_sched_out(task_ctx);
+ bool activate = true;
+ int ret = 0;
- /*
- * If the context we're installing events in is not the
- * active task_ctx, flip them.
- */
- if (ctx->task && task_ctx != ctx) {
- if (task_ctx)
- raw_spin_unlock(&task_ctx->lock);
+ raw_spin_lock(&cpuctx->ctx.lock);
+ if (ctx->task) {
raw_spin_lock(&ctx->lock);
task_ctx = ctx;
- }
-
- if (task_ctx) {
- cpuctx->task_ctx = task_ctx;
- task = task_ctx->task;
- }
- cpu_ctx_sched_out(cpuctx, EVENT_ALL);
+ /* If we're on the wrong CPU, try again */
+ if (task_cpu(ctx->task) != smp_processor_id()) {
+ ret = -ESRCH;
+ goto unlock;
+ }
- update_context_time(ctx);
- /*
- * update cgrp time only if current cgrp
- * matches event->cgrp. Must be done before
- * calling add_event_to_ctx()
- */
- update_cgrp_time_from_event(event);
+ /*
+ * If we're on the right CPU, see if the task we target is
+ * current, if not we don't have to activate the ctx, a future
+ * context switch will do that for us.
+ */
+ if (ctx->task != current)
+ activate = false;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx && cpuctx->task_ctx != ctx);
- add_event_to_ctx(event, ctx);
+ } else if (task_ctx) {
+ raw_spin_lock(&task_ctx->lock);
+ }
- /*
- * Schedule everything back in
- */
- perf_event_sched_in(cpuctx, task_ctx, task);
+ if (activate) {
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ add_event_to_ctx(event, ctx);
+ ctx_resched(cpuctx, task_ctx);
+ } else {
+ add_event_to_ctx(event, ctx);
+ }
- perf_pmu_enable(cpuctx->ctx.pmu);
+unlock:
perf_ctx_unlock(cpuctx, task_ctx);
- return 0;
+ return ret;
}
/*
- * Attach a performance event to a context
- *
- * First we add the event to the list with the hardware enable bit
- * in event->hw_config cleared.
+ * Attach a performance event to a context.
*
- * If the event is attached to a task which is on a CPU we use a smp
- * call to enable it in the task context. The task might have been
- * scheduled away, but we check this in the smp call again.
+ * Very similar to event_function_call, see comment there.
*/
static void
perf_install_in_context(struct perf_event_context *ctx,
struct perf_event *event,
int cpu)
{
+ struct task_struct *task = READ_ONCE(ctx->task);
+
lockdep_assert_held(&ctx->mutex);
event->ctx = ctx;
if (event->cpu != -1)
event->cpu = cpu;
- event_function_call(event, __perf_install_in_context,
- ___perf_install_in_context, event);
+ if (!task) {
+ cpu_function_call(cpu, __perf_install_in_context, event);
+ return;
+ }
+
+ /*
+ * Should not happen, we validate the ctx is still alive before calling.
+ */
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE))
+ return;
+
+ /*
+ * Installing events is tricky because we cannot rely on ctx->is_active
+ * to be set in case this is the nr_events 0 -> 1 transition.
+ */
+again:
+ /*
+ * Cannot use task_function_call() because we need to run on the task's
+ * CPU regardless of whether its current or not.
+ */
+ if (!cpu_function_call(task_cpu(task), __perf_install_in_context, event))
+ return;
+
+ raw_spin_lock_irq(&ctx->lock);
+ task = ctx->task;
+ if (WARN_ON_ONCE(task == TASK_TOMBSTONE)) {
+ /*
+ * Cannot happen because we already checked above (which also
+ * cannot happen), and we hold ctx->mutex, which serializes us
+ * against perf_event_exit_task_context().
+ */
+ raw_spin_unlock_irq(&ctx->lock);
+ return;
+ }
+ raw_spin_unlock_irq(&ctx->lock);
+ /*
+ * Since !ctx->is_active doesn't mean anything, we must IPI
+ * unconditionally.
+ */
+ goto again;
}
/*
@@ -2180,85 +2253,47 @@ static void __perf_event_mark_enabled(struct perf_event *event)
/*
* Cross CPU call to enable a performance event
*/
-static int __perf_event_enable(void *info)
+static void __perf_event_enable(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct perf_event *event = info;
- struct perf_event_context *ctx = event->ctx;
struct perf_event *leader = event->group_leader;
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
- int err;
+ struct perf_event_context *task_ctx;
- /*
- * There's a time window between 'ctx->is_active' check
- * in perf_event_enable function and this place having:
- * - IRQs on
- * - ctx->lock unlocked
- *
- * where the task could be killed and 'ctx' deactivated
- * by perf_event_exit_task.
- */
- if (!ctx->is_active)
- return -EINVAL;
-
- raw_spin_lock(&ctx->lock);
- update_context_time(ctx);
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state <= PERF_EVENT_STATE_ERROR)
+ return;
- if (event->state >= PERF_EVENT_STATE_INACTIVE)
- goto unlock;
-
- /*
- * set current task's cgroup time reference point
- */
- perf_cgroup_set_timestamp(current, ctx);
+ if (ctx->is_active)
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
__perf_event_mark_enabled(event);
+ if (!ctx->is_active)
+ return;
+
if (!event_filter_match(event)) {
if (is_cgroup_event(event))
perf_cgroup_defer_enabled(event);
- goto unlock;
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ return;
}
/*
* If the event is in a group and isn't the group leader,
* then don't put it on unless the group is on.
*/
- if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
- goto unlock;
-
- if (!group_can_go_on(event, cpuctx, 1)) {
- err = -EEXIST;
- } else {
- if (event == leader)
- err = group_sched_in(event, cpuctx, ctx);
- else
- err = event_sched_in(event, cpuctx, ctx);
- }
-
- if (err) {
- /*
- * If this event can't go on and it's part of a
- * group, then the whole group has to come off.
- */
- if (leader != event) {
- group_sched_out(leader, cpuctx, ctx);
- perf_mux_hrtimer_restart(cpuctx);
- }
- if (leader->attr.pinned) {
- update_group_times(leader);
- leader->state = PERF_EVENT_STATE_ERROR;
- }
+ if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE) {
+ ctx_sched_in(ctx, cpuctx, EVENT_TIME, current);
+ return;
}
-unlock:
- raw_spin_unlock(&ctx->lock);
-
- return 0;
-}
+ task_ctx = cpuctx->task_ctx;
+ if (ctx->task)
+ WARN_ON_ONCE(task_ctx != ctx);
-void ___perf_event_enable(void *info)
-{
- __perf_event_mark_enabled((struct perf_event *)info);
+ ctx_resched(cpuctx, task_ctx);
}
/*
@@ -2275,7 +2310,8 @@ static void _perf_event_enable(struct perf_event *event)
struct perf_event_context *ctx = event->ctx;
raw_spin_lock_irq(&ctx->lock);
- if (event->state >= PERF_EVENT_STATE_INACTIVE) {
+ if (event->state >= PERF_EVENT_STATE_INACTIVE ||
+ event->state < PERF_EVENT_STATE_ERROR) {
raw_spin_unlock_irq(&ctx->lock);
return;
}
@@ -2291,8 +2327,7 @@ static void _perf_event_enable(struct perf_event *event)
event->state = PERF_EVENT_STATE_OFF;
raw_spin_unlock_irq(&ctx->lock);
- event_function_call(event, __perf_event_enable,
- ___perf_event_enable, event);
+ event_function_call(event, __perf_event_enable, NULL);
}
/*
@@ -2342,25 +2377,49 @@ static void ctx_sched_out(struct perf_event_context *ctx,
struct perf_cpu_context *cpuctx,
enum event_type_t event_type)
{
- struct perf_event *event;
int is_active = ctx->is_active;
+ struct perf_event *event;
- ctx->is_active &= ~event_type;
- if (likely(!ctx->nr_events))
+ lockdep_assert_held(&ctx->lock);
+
+ if (likely(!ctx->nr_events)) {
+ /*
+ * See __perf_remove_from_context().
+ */
+ WARN_ON_ONCE(ctx->is_active);
+ if (ctx->task)
+ WARN_ON_ONCE(cpuctx->task_ctx);
return;
+ }
- update_context_time(ctx);
- update_cgrp_time_from_cpuctx(cpuctx);
- if (!ctx->nr_active)
+ ctx->is_active &= ~event_type;
+ if (!(ctx->is_active & EVENT_ALL))
+ ctx->is_active = 0;
+
+ if (ctx->task) {
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ if (!ctx->is_active)
+ cpuctx->task_ctx = NULL;
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* update (and stop) ctx time */
+ update_context_time(ctx);
+ update_cgrp_time_from_cpuctx(cpuctx);
+ }
+
+ if (!ctx->nr_active || !(is_active & EVENT_ALL))
return;
perf_pmu_disable(ctx->pmu);
- if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
+ if (is_active & EVENT_PINNED) {
list_for_each_entry(event, &ctx->pinned_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
- if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
+ if (is_active & EVENT_FLEXIBLE) {
list_for_each_entry(event, &ctx->flexible_groups, group_entry)
group_sched_out(event, cpuctx, ctx);
}
@@ -2518,17 +2577,21 @@ static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
raw_spin_lock(&ctx->lock);
raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
if (context_equiv(ctx, next_ctx)) {
- /*
- * XXX do we need a memory barrier of sorts
- * wrt to rcu_dereference() of perf_event_ctxp
- */
- task->perf_event_ctxp[ctxn] = next_ctx;
- next->perf_event_ctxp[ctxn] = ctx;
- ctx->task = next;
- next_ctx->task = task;
+ WRITE_ONCE(ctx->task, next);
+ WRITE_ONCE(next_ctx->task, task);
swap(ctx->task_ctx_data, next_ctx->task_ctx_data);
+ /*
+ * RCU_INIT_POINTER here is safe because we've not
+ * modified the ctx and the above modification of
+ * ctx->task and ctx->task_ctx_data are immaterial
+ * since those values are always verified under
+ * ctx->lock which we're now holding.
+ */
+ RCU_INIT_POINTER(task->perf_event_ctxp[ctxn], next_ctx);
+ RCU_INIT_POINTER(next->perf_event_ctxp[ctxn], ctx);
+
do_switch = 0;
perf_event_sync_stat(ctx, next_ctx);
@@ -2541,8 +2604,7 @@ unlock:
if (do_switch) {
raw_spin_lock(&ctx->lock);
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
+ task_ctx_sched_out(cpuctx, ctx);
raw_spin_unlock(&ctx->lock);
}
}
@@ -2637,20 +2699,6 @@ void __perf_event_task_sched_out(struct task_struct *task,
perf_cgroup_sched_out(task, next);
}
-static void task_ctx_sched_out(struct perf_event_context *ctx)
-{
- struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
-
- if (!cpuctx->task_ctx)
- return;
-
- if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
- return;
-
- ctx_sched_out(ctx, cpuctx, EVENT_ALL);
- cpuctx->task_ctx = NULL;
-}
-
/*
* Called with IRQs disabled
*/
@@ -2725,25 +2773,40 @@ ctx_sched_in(struct perf_event_context *ctx,
enum event_type_t event_type,
struct task_struct *task)
{
- u64 now;
int is_active = ctx->is_active;
+ u64 now;
+
+ lockdep_assert_held(&ctx->lock);
- ctx->is_active |= event_type;
if (likely(!ctx->nr_events))
return;
- now = perf_clock();
- ctx->timestamp = now;
- perf_cgroup_set_timestamp(task, ctx);
+ ctx->is_active |= (event_type | EVENT_TIME);
+ if (ctx->task) {
+ if (!is_active)
+ cpuctx->task_ctx = ctx;
+ else
+ WARN_ON_ONCE(cpuctx->task_ctx != ctx);
+ }
+
+ is_active ^= ctx->is_active; /* changed bits */
+
+ if (is_active & EVENT_TIME) {
+ /* start ctx time */
+ now = perf_clock();
+ ctx->timestamp = now;
+ perf_cgroup_set_timestamp(task, ctx);
+ }
+
/*
* First go through the list and put on any pinned groups
* in order to give them the best chance of going on.
*/
- if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
+ if (is_active & EVENT_PINNED)
ctx_pinned_sched_in(ctx, cpuctx);
/* Then walk through the lower prio flexible groups */
- if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
+ if (is_active & EVENT_FLEXIBLE)
ctx_flexible_sched_in(ctx, cpuctx);
}
@@ -2773,12 +2836,7 @@ static void perf_event_context_sched_in(struct perf_event_context *ctx,
* cpu flexible, task flexible.
*/
cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
-
- if (ctx->nr_events)
- cpuctx->task_ctx = ctx;
-
- perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
-
+ perf_event_sched_in(cpuctx, ctx, task);
perf_pmu_enable(ctx->pmu);
perf_ctx_unlock(cpuctx, ctx);
}
@@ -2800,6 +2858,16 @@ void __perf_event_task_sched_in(struct task_struct *prev,
struct perf_event_context *ctx;
int ctxn;
+ /*
+ * If cgroup events exist on this CPU, then we need to check if we have
+ * to switch in PMU state; cgroup event are system-wide mode only.
+ *
+ * Since cgroup events are CPU events, we must schedule these in before
+ * we schedule in the task events.
+ */
+ if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
+ perf_cgroup_sched_in(prev, task);
+
for_each_task_context_nr(ctxn) {
ctx = task->perf_event_ctxp[ctxn];
if (likely(!ctx))
@@ -2807,13 +2875,6 @@ void __perf_event_task_sched_in(struct task_struct *prev,
perf_event_context_sched_in(ctx, task);
}
- /*
- * if cgroup events exist on this CPU, then we need
- * to check if we have to switch in PMU state.
- * cgroup event are system-wide mode only
- */
- if (atomic_read(this_cpu_ptr(&perf_cgroup_events)))
- perf_cgroup_sched_in(prev, task);
if (atomic_read(&nr_switch_events))
perf_event_switch(task, prev, true);
@@ -3099,46 +3160,31 @@ static int event_enable_on_exec(struct perf_event *event,
static void perf_event_enable_on_exec(int ctxn)
{
struct perf_event_context *ctx, *clone_ctx = NULL;
+ struct perf_cpu_context *cpuctx;
struct perf_event *event;
unsigned long flags;
int enabled = 0;
- int ret;
local_irq_save(flags);
ctx = current->perf_event_ctxp[ctxn];
if (!ctx || !ctx->nr_events)
goto out;
- /*
- * We must ctxsw out cgroup events to avoid conflict
- * when invoking perf_task_event_sched_in() later on
- * in this function. Otherwise we end up trying to
- * ctxswin cgroup events which are already scheduled
- * in.
- */
- perf_cgroup_sched_out(current, NULL);
-
- raw_spin_lock(&ctx->lock);
- task_ctx_sched_out(ctx);
-
- list_for_each_entry(event, &ctx->event_list, event_entry) {
- ret = event_enable_on_exec(event, ctx);
- if (ret)
- enabled = 1;
- }
+ cpuctx = __get_cpu_context(ctx);
+ perf_ctx_lock(cpuctx, ctx);
+ ctx_sched_out(ctx, cpuctx, EVENT_TIME);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ enabled |= event_enable_on_exec(event, ctx);
/*
- * Unclone this context if we enabled any event.
+ * Unclone and reschedule this context if we enabled any event.
*/
- if (enabled)
+ if (enabled) {
clone_ctx = unclone_ctx(ctx);
+ ctx_resched(cpuctx, ctx);
+ }
+ perf_ctx_unlock(cpuctx, ctx);
- raw_spin_unlock(&ctx->lock);
-
- /*
- * Also calls ctxswin for cgroup events, if any:
- */
- perf_event_context_sched_in(ctx, ctx->task);
out:
local_irq_restore(flags);
@@ -3334,7 +3380,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx)
INIT_LIST_HEAD(&ctx->flexible_groups);
INIT_LIST_HEAD(&ctx->event_list);
atomic_set(&ctx->refcount, 1);
- INIT_DELAYED_WORK(&ctx->orphans_remove, orphans_remove_work);
}
static struct perf_event_context *
@@ -3521,11 +3566,13 @@ static void unaccount_event_cpu(struct perf_event *event, int cpu)
static void unaccount_event(struct perf_event *event)
{
+ bool dec = false;
+
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_dec(&nr_mmap_events);
if (event->attr.comm)
@@ -3535,17 +3582,30 @@ static void unaccount_event(struct perf_event *event)
if (event->attr.freq)
atomic_dec(&nr_freq_events);
if (event->attr.context_switch) {
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
atomic_dec(&nr_switch_events);
}
if (is_cgroup_event(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
if (has_branch_stack(event))
- static_key_slow_dec_deferred(&perf_sched_events);
+ dec = true;
+
+ if (dec) {
+ if (!atomic_add_unless(&perf_sched_count, -1, 1))
+ schedule_delayed_work(&perf_sched_work, HZ);
+ }
unaccount_event_cpu(event, event->cpu);
}
+static void perf_sched_delayed(struct work_struct *work)
+{
+ mutex_lock(&perf_sched_mutex);
+ if (atomic_dec_and_test(&perf_sched_count))
+ static_branch_disable(&perf_sched_events);
+ mutex_unlock(&perf_sched_mutex);
+}
+
/*
* The following implement mutual exclusion of events on "exclusive" pmus
* (PERF_PMU_CAP_EXCLUSIVE). Such pmus can only have one event scheduled
@@ -3556,7 +3616,7 @@ static void unaccount_event(struct perf_event *event)
* 3) two matching events on the same context.
*
* The former two cases are handled in the allocation path (perf_event_alloc(),
- * __free_event()), the latter -- before the first perf_install_in_context().
+ * _free_event()), the latter -- before the first perf_install_in_context().
*/
static int exclusive_event_init(struct perf_event *event)
{
@@ -3631,29 +3691,6 @@ static bool exclusive_event_installable(struct perf_event *event,
return true;
}
-static void __free_event(struct perf_event *event)
-{
- if (!event->parent) {
- if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
- put_callchain_buffers();
- }
-
- perf_event_free_bpf_prog(event);
-
- if (event->destroy)
- event->destroy(event);
-
- if (event->ctx)
- put_ctx(event->ctx);
-
- if (event->pmu) {
- exclusive_event_destroy(event);
- module_put(event->pmu->module);
- }
-
- call_rcu(&event->rcu_head, free_event_rcu);
-}
-
static void _free_event(struct perf_event *event)
{
irq_work_sync(&event->pending);
@@ -3675,7 +3712,25 @@ static void _free_event(struct perf_event *event)
if (is_cgroup_event(event))
perf_detach_cgroup(event);
- __free_event(event);
+ if (!event->parent) {
+ if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+ put_callchain_buffers();
+ }
+
+ perf_event_free_bpf_prog(event);
+
+ if (event->destroy)
+ event->destroy(event);
+
+ if (event->ctx)
+ put_ctx(event->ctx);
+
+ if (event->pmu) {
+ exclusive_event_destroy(event);
+ module_put(event->pmu->module);
+ }
+
+ call_rcu(&event->rcu_head, free_event_rcu);
}
/*
@@ -3702,14 +3757,13 @@ static void perf_remove_from_owner(struct perf_event *event)
struct task_struct *owner;
rcu_read_lock();
- owner = ACCESS_ONCE(event->owner);
/*
- * Matches the smp_wmb() in perf_event_exit_task(). If we observe
- * !owner it means the list deletion is complete and we can indeed
- * free this event, otherwise we need to serialize on
+ * Matches the smp_store_release() in perf_event_exit_task(). If we
+ * observe !owner it means the list deletion is complete and we can
+ * indeed free this event, otherwise we need to serialize on
* owner->perf_event_mutex.
*/
- smp_read_barrier_depends();
+ owner = lockless_dereference(event->owner);
if (owner) {
/*
* Since delayed_put_task_struct() also drops the last
@@ -3737,8 +3791,10 @@ static void perf_remove_from_owner(struct perf_event *event)
* ensured they're done, and we can proceed with freeing the
* event.
*/
- if (event->owner)
+ if (event->owner) {
list_del_init(&event->owner_entry);
+ smp_store_release(&event->owner, NULL);
+ }
mutex_unlock(&owner->perf_event_mutex);
put_task_struct(owner);
}
@@ -3746,37 +3802,111 @@ static void perf_remove_from_owner(struct perf_event *event)
static void put_event(struct perf_event *event)
{
- struct perf_event_context *ctx;
-
if (!atomic_long_dec_and_test(&event->refcount))
return;
+ _free_event(event);
+}
+
+/*
+ * Kill an event dead; while event:refcount will preserve the event
+ * object, it will not preserve its functionality. Once the last 'user'
+ * gives up the object, we'll destroy the thing.
+ */
+int perf_event_release_kernel(struct perf_event *event)
+{
+ struct perf_event_context *ctx = event->ctx;
+ struct perf_event *child, *tmp;
+
+ /*
+ * If we got here through err_file: fput(event_file); we will not have
+ * attached to a context yet.
+ */
+ if (!ctx) {
+ WARN_ON_ONCE(event->attach_state &
+ (PERF_ATTACH_CONTEXT|PERF_ATTACH_GROUP));
+ goto no_ctx;
+ }
+
if (!is_kernel_event(event))
perf_remove_from_owner(event);
+ ctx = perf_event_ctx_lock(event);
+ WARN_ON_ONCE(ctx->parent_ctx);
+ perf_remove_from_context(event, DETACH_GROUP);
+
+ raw_spin_lock_irq(&ctx->lock);
/*
- * There are two ways this annotation is useful:
+ * Mark this even as STATE_DEAD, there is no external reference to it
+ * anymore.
*
- * 1) there is a lock recursion from perf_event_exit_task
- * see the comment there.
+ * Anybody acquiring event->child_mutex after the below loop _must_
+ * also see this, most importantly inherit_event() which will avoid
+ * placing more children on the list.
*
- * 2) there is a lock-inversion with mmap_sem through
- * perf_read_group(), which takes faults while
- * holding ctx->mutex, however this is called after
- * the last filedesc died, so there is no possibility
- * to trigger the AB-BA case.
+ * Thus this guarantees that we will in fact observe and kill _ALL_
+ * child events.
*/
- ctx = perf_event_ctx_lock_nested(event, SINGLE_DEPTH_NESTING);
- WARN_ON_ONCE(ctx->parent_ctx);
- perf_remove_from_context(event, true);
+ event->state = PERF_EVENT_STATE_DEAD;
+ raw_spin_unlock_irq(&ctx->lock);
+
perf_event_ctx_unlock(event, ctx);
- _free_event(event);
-}
+again:
+ mutex_lock(&event->child_mutex);
+ list_for_each_entry(child, &event->child_list, child_list) {
-int perf_event_release_kernel(struct perf_event *event)
-{
- put_event(event);
+ /*
+ * Cannot change, child events are not migrated, see the
+ * comment with perf_event_ctx_lock_nested().
+ */
+ ctx = lockless_dereference(child->ctx);
+ /*
+ * Since child_mutex nests inside ctx::mutex, we must jump
+ * through hoops. We start by grabbing a reference on the ctx.
+ *
+ * Since the event cannot get freed while we hold the
+ * child_mutex, the context must also exist and have a !0
+ * reference count.
+ */
+ get_ctx(ctx);
+
+ /*
+ * Now that we have a ctx ref, we can drop child_mutex, and
+ * acquire ctx::mutex without fear of it going away. Then we
+ * can re-acquire child_mutex.
+ */
+ mutex_unlock(&event->child_mutex);
+ mutex_lock(&ctx->mutex);
+ mutex_lock(&event->child_mutex);
+
+ /*
+ * Now that we hold ctx::mutex and child_mutex, revalidate our
+ * state, if child is still the first entry, it didn't get freed
+ * and we can continue doing so.
+ */
+ tmp = list_first_entry_or_null(&event->child_list,
+ struct perf_event, child_list);
+ if (tmp == child) {
+ perf_remove_from_context(child, DETACH_GROUP);
+ list_del(&child->child_list);
+ free_event(child);
+ /*
+ * This matches the refcount bump in inherit_event();
+ * this can't be the last reference.
+ */
+ put_event(event);
+ }
+
+ mutex_unlock(&event->child_mutex);
+ mutex_unlock(&ctx->mutex);
+ put_ctx(ctx);
+ goto again;
+ }
+ mutex_unlock(&event->child_mutex);
+
+no_ctx:
+ put_event(event); /* Must be the 'last' reference */
return 0;
}
EXPORT_SYMBOL_GPL(perf_event_release_kernel);
@@ -3786,46 +3916,10 @@ EXPORT_SYMBOL_GPL(perf_event_release_kernel);
*/
static int perf_release(struct inode *inode, struct file *file)
{
- put_event(file->private_data);
+ perf_event_release_kernel(file->private_data);
return 0;
}
-/*
- * Remove all orphanes events from the context.
- */
-static void orphans_remove_work(struct work_struct *work)
-{
- struct perf_event_context *ctx;
- struct perf_event *event, *tmp;
-
- ctx = container_of(work, struct perf_event_context,
- orphans_remove.work);
-
- mutex_lock(&ctx->mutex);
- list_for_each_entry_safe(event, tmp, &ctx->event_list, event_entry) {
- struct perf_event *parent_event = event->parent;
-
- if (!is_orphaned_child(event))
- continue;
-
- perf_remove_from_context(event, true);
-
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- free_event(event);
- put_event(parent_event);
- }
-
- raw_spin_lock_irq(&ctx->lock);
- ctx->orphans_remove_sched = false;
- raw_spin_unlock_irq(&ctx->lock);
- mutex_unlock(&ctx->mutex);
-
- put_ctx(ctx);
-}
-
u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
{
struct perf_event *child;
@@ -3969,7 +4063,7 @@ static bool is_event_hup(struct perf_event *event)
{
bool no_children;
- if (event->state != PERF_EVENT_STATE_EXIT)
+ if (event->state > PERF_EVENT_STATE_EXIT)
return false;
mutex_lock(&event->child_mutex);
@@ -4054,7 +4148,7 @@ static void _perf_event_reset(struct perf_event *event)
/*
* Holding the top-level event's child_mutex means that any
* descendant process that has inherited this event will block
- * in sync_child_event if it goes to exit, thus satisfying the
+ * in perf_event_exit_event() if it goes to exit, thus satisfying the
* task existence requirements of perf_event_enable/disable.
*/
static void perf_event_for_each_child(struct perf_event *event,
@@ -4086,36 +4180,14 @@ static void perf_event_for_each(struct perf_event *event,
perf_event_for_each_child(sibling, func);
}
-struct period_event {
- struct perf_event *event;
- u64 value;
-};
-
-static void ___perf_event_period(void *info)
-{
- struct period_event *pe = info;
- struct perf_event *event = pe->event;
- u64 value = pe->value;
-
- if (event->attr.freq) {
- event->attr.sample_freq = value;
- } else {
- event->attr.sample_period = value;
- event->hw.sample_period = value;
- }
-
- local64_set(&event->hw.period_left, 0);
-}
-
-static int __perf_event_period(void *info)
+static void __perf_event_period(struct perf_event *event,
+ struct perf_cpu_context *cpuctx,
+ struct perf_event_context *ctx,
+ void *info)
{
- struct period_event *pe = info;
- struct perf_event *event = pe->event;
- struct perf_event_context *ctx = event->ctx;
- u64 value = pe->value;
+ u64 value = *((u64 *)info);
bool active;
- raw_spin_lock(&ctx->lock);
if (event->attr.freq) {
event->attr.sample_freq = value;
} else {
@@ -4135,14 +4207,10 @@ static int __perf_event_period(void *info)
event->pmu->start(event, PERF_EF_RELOAD);
perf_pmu_enable(ctx->pmu);
}
- raw_spin_unlock(&ctx->lock);
-
- return 0;
}
static int perf_event_period(struct perf_event *event, u64 __user *arg)
{
- struct period_event pe = { .event = event, };
u64 value;
if (!is_sampling_event(event))
@@ -4157,10 +4225,7 @@ static int perf_event_period(struct perf_event *event, u64 __user *arg)
if (event->attr.freq && value > sysctl_perf_event_sample_rate)
return -EINVAL;
- pe.value = value;
-
- event_function_call(event, __perf_event_period,
- ___perf_event_period, &pe);
+ event_function_call(event, __perf_event_period, &value);
return 0;
}
@@ -4932,7 +4997,7 @@ static void perf_pending_event(struct irq_work *entry)
if (event->pending_disable) {
event->pending_disable = 0;
- __perf_event_disable(event);
+ perf_event_disable_local(event);
}
if (event->pending_wakeup) {
@@ -7753,11 +7818,13 @@ static void account_event_cpu(struct perf_event *event, int cpu)
static void account_event(struct perf_event *event)
{
+ bool inc = false;
+
if (event->parent)
return;
if (event->attach_state & PERF_ATTACH_TASK)
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (event->attr.mmap || event->attr.mmap_data)
atomic_inc(&nr_mmap_events);
if (event->attr.comm)
@@ -7770,12 +7837,35 @@ static void account_event(struct perf_event *event)
}
if (event->attr.context_switch) {
atomic_inc(&nr_switch_events);
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
}
if (has_branch_stack(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
if (is_cgroup_event(event))
- static_key_slow_inc(&perf_sched_events.key);
+ inc = true;
+
+ if (inc) {
+ if (atomic_inc_not_zero(&perf_sched_count))
+ goto enabled;
+
+ mutex_lock(&perf_sched_mutex);
+ if (!atomic_read(&perf_sched_count)) {
+ static_branch_enable(&perf_sched_events);
+ /*
+ * Guarantee that all CPUs observe they key change and
+ * call the perf scheduling hooks before proceeding to
+ * install events that need them.
+ */
+ synchronize_sched();
+ }
+ /*
+ * Now that we have waited for the sync_sched(), allow further
+ * increments to by-pass the mutex.
+ */
+ atomic_inc(&perf_sched_count);
+ mutex_unlock(&perf_sched_mutex);
+ }
+enabled:
account_event_cpu(event, event->cpu);
}
@@ -8394,10 +8484,19 @@ SYSCALL_DEFINE5(perf_event_open,
if (move_group) {
gctx = group_leader->ctx;
mutex_lock_double(&gctx->mutex, &ctx->mutex);
+ if (gctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
} else {
mutex_lock(&ctx->mutex);
}
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_locked;
+ }
+
if (!perf_event_validate_size(event)) {
err = -E2BIG;
goto err_locked;
@@ -8422,11 +8521,11 @@ SYSCALL_DEFINE5(perf_event_open,
* See perf_event_ctx_lock() for comments on the details
* of swizzling perf_event::ctx.
*/
- perf_remove_from_context(group_leader, false);
+ perf_remove_from_context(group_leader, 0);
list_for_each_entry(sibling, &group_leader->sibling_list,
group_entry) {
- perf_remove_from_context(sibling, false);
+ perf_remove_from_context(sibling, 0);
put_ctx(gctx);
}
@@ -8479,6 +8578,8 @@ SYSCALL_DEFINE5(perf_event_open,
perf_event__header_size(event);
perf_event__id_header_size(event);
+ event->owner = current;
+
perf_install_in_context(ctx, event, event->cpu);
perf_unpin_context(ctx);
@@ -8488,8 +8589,6 @@ SYSCALL_DEFINE5(perf_event_open,
put_online_cpus();
- event->owner = current;
-
mutex_lock(&current->perf_event_mutex);
list_add_tail(&event->owner_entry, &current->perf_event_list);
mutex_unlock(&current->perf_event_mutex);
@@ -8514,7 +8613,12 @@ err_context:
perf_unpin_context(ctx);
put_ctx(ctx);
err_alloc:
- free_event(event);
+ /*
+ * If event_file is set, the fput() above will have called ->release()
+ * and that will take care of freeing the event.
+ */
+ if (!event_file)
+ free_event(event);
err_cpus:
put_online_cpus();
err_task:
@@ -8556,7 +8660,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
}
/* Mark owner so we could distinguish it from user events. */
- event->owner = EVENT_OWNER_KERNEL;
+ event->owner = TASK_TOMBSTONE;
account_event(event);
@@ -8568,12 +8672,14 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
WARN_ON_ONCE(ctx->parent_ctx);
mutex_lock(&ctx->mutex);
+ if (ctx->task == TASK_TOMBSTONE) {
+ err = -ESRCH;
+ goto err_unlock;
+ }
+
if (!exclusive_event_installable(event, ctx)) {
- mutex_unlock(&ctx->mutex);
- perf_unpin_context(ctx);
- put_ctx(ctx);
err = -EBUSY;
- goto err_free;
+ goto err_unlock;
}
perf_install_in_context(ctx, event, cpu);
@@ -8582,6 +8688,10 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
return event;
+err_unlock:
+ mutex_unlock(&ctx->mutex);
+ perf_unpin_context(ctx);
+ put_ctx(ctx);
err_free:
free_event(event);
err:
@@ -8606,7 +8716,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
mutex_lock_double(&src_ctx->mutex, &dst_ctx->mutex);
list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
event_entry) {
- perf_remove_from_context(event, false);
+ perf_remove_from_context(event, 0);
unaccount_event_cpu(event, src_cpu);
put_ctx(src_ctx);
list_add(&event->migrate_entry, &events);
@@ -8673,33 +8783,15 @@ static void sync_child_event(struct perf_event *child_event,
&parent_event->child_total_time_enabled);
atomic64_add(child_event->total_time_running,
&parent_event->child_total_time_running);
-
- /*
- * Remove this event from the parent's list
- */
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
- list_del_init(&child_event->child_list);
- mutex_unlock(&parent_event->child_mutex);
-
- /*
- * Make sure user/parent get notified, that we just
- * lost one event.
- */
- perf_event_wakeup(parent_event);
-
- /*
- * Release the parent event, if this was the last
- * reference to it.
- */
- put_event(parent_event);
}
static void
-__perf_event_exit_task(struct perf_event *child_event,
- struct perf_event_context *child_ctx,
- struct task_struct *child)
+perf_event_exit_event(struct perf_event *child_event,
+ struct perf_event_context *child_ctx,
+ struct task_struct *child)
{
+ struct perf_event *parent_event = child_event->parent;
+
/*
* Do not destroy the 'original' grouping; because of the context
* switch optimization the original events could've ended up in a
@@ -8712,57 +8804,86 @@ __perf_event_exit_task(struct perf_event *child_event,
* Do destroy all inherited groups, we don't care about those
* and being thorough is better.
*/
- perf_remove_from_context(child_event, !!child_event->parent);
+ raw_spin_lock_irq(&child_ctx->lock);
+ WARN_ON_ONCE(child_ctx->is_active);
+
+ if (parent_event)
+ perf_group_detach(child_event);
+ list_del_event(child_event, child_ctx);
+ child_event->state = PERF_EVENT_STATE_EXIT; /* is_event_hup() */
+ raw_spin_unlock_irq(&child_ctx->lock);
/*
- * It can happen that the parent exits first, and has events
- * that are still around due to the child reference. These
- * events need to be zapped.
+ * Parent events are governed by their filedesc, retain them.
*/
- if (child_event->parent) {
- sync_child_event(child_event, child);
- free_event(child_event);
- } else {
- child_event->state = PERF_EVENT_STATE_EXIT;
+ if (!parent_event) {
perf_event_wakeup(child_event);
+ return;
}
+ /*
+ * Child events can be cleaned up.
+ */
+
+ sync_child_event(child_event, child);
+
+ /*
+ * Remove this event from the parent's list
+ */
+ WARN_ON_ONCE(parent_event->ctx->parent_ctx);
+ mutex_lock(&parent_event->child_mutex);
+ list_del_init(&child_event->child_list);
+ mutex_unlock(&parent_event->child_mutex);
+
+ /*
+ * Kick perf_poll() for is_event_hup().
+ */
+ perf_event_wakeup(parent_event);
+ free_event(child_event);
+ put_event(parent_event);
}
static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
{
- struct perf_event *child_event, *next;
struct perf_event_context *child_ctx, *clone_ctx = NULL;
- unsigned long flags;
+ struct perf_event *child_event, *next;
- if (likely(!child->perf_event_ctxp[ctxn]))
+ WARN_ON_ONCE(child != current);
+
+ child_ctx = perf_pin_task_context(child, ctxn);
+ if (!child_ctx)
return;
- local_irq_save(flags);
/*
- * We can't reschedule here because interrupts are disabled,
- * and either child is current or it is a task that can't be
- * scheduled, so we are now safe from rescheduling changing
- * our context.
+ * In order to reduce the amount of tricky in ctx tear-down, we hold
+ * ctx::mutex over the entire thing. This serializes against almost
+ * everything that wants to access the ctx.
+ *
+ * The exception is sys_perf_event_open() /
+ * perf_event_create_kernel_count() which does find_get_context()
+ * without ctx::mutex (it cannot because of the move_group double mutex
+ * lock thing). See the comments in perf_install_in_context().
*/
- child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
+ mutex_lock(&child_ctx->mutex);
/*
- * Take the context lock here so that if find_get_context is
- * reading child->perf_event_ctxp, we wait until it has
- * incremented the context's refcount before we do put_ctx below.
+ * In a single ctx::lock section, de-schedule the events and detach the
+ * context from the task such that we cannot ever get it scheduled back
+ * in.
*/
- raw_spin_lock(&child_ctx->lock);
- task_ctx_sched_out(child_ctx);
- child->perf_event_ctxp[ctxn] = NULL;
+ raw_spin_lock_irq(&child_ctx->lock);
+ task_ctx_sched_out(__get_cpu_context(child_ctx), child_ctx);
/*
- * If this context is a clone; unclone it so it can't get
- * swapped to another process while we're removing all
- * the events from it.
+ * Now that the context is inactive, destroy the task <-> ctx relation
+ * and mark the context dead.
*/
+ RCU_INIT_POINTER(child->perf_event_ctxp[ctxn], NULL);
+ put_ctx(child_ctx); /* cannot be last */
+ WRITE_ONCE(child_ctx->task, TASK_TOMBSTONE);
+ put_task_struct(current); /* cannot be last */
+
clone_ctx = unclone_ctx(child_ctx);
- update_context_time(child_ctx);
- raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
+ raw_spin_unlock_irq(&child_ctx->lock);
if (clone_ctx)
put_ctx(clone_ctx);
@@ -8774,20 +8895,8 @@ static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
*/
perf_event_task(child, child_ctx, 0);
- /*
- * We can recurse on the same lock type through:
- *
- * __perf_event_exit_task()
- * sync_child_event()
- * put_event()
- * mutex_lock(&ctx->mutex)
- *
- * But since its the parent context it won't be the same instance.
- */
- mutex_lock(&child_ctx->mutex);
-
list_for_each_entry_safe(child_event, next, &child_ctx->event_list, event_entry)
- __perf_event_exit_task(child_event, child_ctx, child);
+ perf_event_exit_event(child_event, child_ctx, child);
mutex_unlock(&child_ctx->mutex);
@@ -8812,8 +8921,7 @@ void perf_event_exit_task(struct task_struct *child)
* the owner, closes a race against perf_release() where
* we need to serialize on the owner->perf_event_mutex.
*/
- smp_wmb();
- event->owner = NULL;
+ smp_store_release(&event->owner, NULL);
}
mutex_unlock(&child->perf_event_mutex);
@@ -8896,21 +9004,20 @@ void perf_event_delayed_put(struct task_struct *task)
WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
}
-struct perf_event *perf_event_get(unsigned int fd)
+struct file *perf_event_get(unsigned int fd)
{
- int err;
- struct fd f;
- struct perf_event *event;
+ struct file *file;
- err = perf_fget_light(fd, &f);
- if (err)
- return ERR_PTR(err);
+ file = fget_raw(fd);
+ if (!file)
+ return ERR_PTR(-EBADF);
- event = f.file->private_data;
- atomic_long_inc(&event->refcount);
- fdput(f);
+ if (file->f_op != &perf_fops) {
+ fput(file);
+ return ERR_PTR(-EBADF);
+ }
- return event;
+ return file;
}
const struct perf_event_attr *perf_event_attrs(struct perf_event *event)
@@ -8953,8 +9060,16 @@ inherit_event(struct perf_event *parent_event,
if (IS_ERR(child_event))
return child_event;
+ /*
+ * is_orphaned_event() and list_add_tail(&parent_event->child_list)
+ * must be under the same lock in order to serialize against
+ * perf_event_release_kernel(), such that either we must observe
+ * is_orphaned_event() or they will observe us on the child_list.
+ */
+ mutex_lock(&parent_event->child_mutex);
if (is_orphaned_event(parent_event) ||
!atomic_long_inc_not_zero(&parent_event->refcount)) {
+ mutex_unlock(&parent_event->child_mutex);
free_event(child_event);
return NULL;
}
@@ -9002,8 +9117,6 @@ inherit_event(struct perf_event *parent_event,
/*
* Link this into the parent event's child list
*/
- WARN_ON_ONCE(parent_event->ctx->parent_ctx);
- mutex_lock(&parent_event->child_mutex);
list_add_tail(&child_event->child_list, &parent_event->child_list);
mutex_unlock(&parent_event->child_mutex);
@@ -9208,7 +9321,7 @@ static void perf_event_init_cpu(int cpu)
struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
mutex_lock(&swhash->hlist_mutex);
- if (swhash->hlist_refcount > 0) {
+ if (swhash->hlist_refcount > 0 && !swevent_hlist_deref(swhash)) {
struct swevent_hlist *hlist;
hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
@@ -9221,13 +9334,14 @@ static void perf_event_init_cpu(int cpu)
#if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC_CORE
static void __perf_event_exit_context(void *__info)
{
- struct remove_event re = { .detach_group = true };
struct perf_event_context *ctx = __info;
+ struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
+ struct perf_event *event;
- rcu_read_lock();
- list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
- __perf_remove_from_context(&re);
- rcu_read_unlock();
+ raw_spin_lock(&ctx->lock);
+ list_for_each_entry(event, &ctx->event_list, event_entry)
+ __perf_remove_from_context(event, cpuctx, ctx, (void *)DETACH_GROUP);
+ raw_spin_unlock(&ctx->lock);
}
static void perf_event_exit_cpu_context(int cpu)
@@ -9283,11 +9397,9 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
switch (action & ~CPU_TASKS_FROZEN) {
case CPU_UP_PREPARE:
- case CPU_DOWN_FAILED:
perf_event_init_cpu(cpu);
break;
- case CPU_UP_CANCELED:
case CPU_DOWN_PREPARE:
perf_event_exit_cpu(cpu);
break;
@@ -9316,9 +9428,6 @@ void __init perf_event_init(void)
ret = init_hw_breakpoint();
WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
- /* do not patch jump label more than once per second */
- jump_label_rate_limit(&perf_sched_events, HZ);
-
/*
* Build time assertion that we keep the data_head at the intended
* location. IOW, validation we got the __reserved[] size right.
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 92ce5f4ccc26..3f8cb1e14588 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -444,7 +444,7 @@ int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *att
* current task.
*/
if (irqs_disabled() && bp->ctx && bp->ctx->task == current)
- __perf_event_disable(bp);
+ perf_event_disable_local(bp);
else
perf_event_disable(bp);
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index adfdc0536117..1faad2cfdb9e 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -459,6 +459,25 @@ static void rb_free_aux_page(struct ring_buffer *rb, int idx)
__free_page(page);
}
+static void __rb_free_aux(struct ring_buffer *rb)
+{
+ int pg;
+
+ if (rb->aux_priv) {
+ rb->free_aux(rb->aux_priv);
+ rb->free_aux = NULL;
+ rb->aux_priv = NULL;
+ }
+
+ if (rb->aux_nr_pages) {
+ for (pg = 0; pg < rb->aux_nr_pages; pg++)
+ rb_free_aux_page(rb, pg);
+
+ kfree(rb->aux_pages);
+ rb->aux_nr_pages = 0;
+ }
+}
+
int rb_alloc_aux(struct ring_buffer *rb, struct perf_event *event,
pgoff_t pgoff, int nr_pages, long watermark, int flags)
{
@@ -547,30 +566,11 @@ out:
if (!ret)
rb->aux_pgoff = pgoff;
else
- rb_free_aux(rb);
+ __rb_free_aux(rb);
return ret;
}
-static void __rb_free_aux(struct ring_buffer *rb)
-{
- int pg;
-
- if (rb->aux_priv) {
- rb->free_aux(rb->aux_priv);
- rb->free_aux = NULL;
- rb->aux_priv = NULL;
- }
-
- if (rb->aux_nr_pages) {
- for (pg = 0; pg < rb->aux_nr_pages; pg++)
- rb_free_aux_page(rb, pg);
-
- kfree(rb->aux_pages);
- rb->aux_nr_pages = 0;
- }
-}
-
void rb_free_aux(struct ring_buffer *rb)
{
if (atomic_dec_and_test(&rb->aux_refcount))
diff --git a/kernel/futex.c b/kernel/futex.c
index 0773f2b23b10..5d6ce6413ef1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1191,7 +1191,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (pi_state->owner != current)
return -EINVAL;
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -1217,22 +1217,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else if (curval != uval)
ret = -EINVAL;
if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
}
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
- raw_spin_lock_irq(&new_owner->pi_lock);
+ raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
+ raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
@@ -2127,11 +2127,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* we returned due to timeout or signal without taking the
* rt_mutex. Too late.
*/
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
if (!owner)
owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a302cf9a2126..57bff7857e87 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -138,7 +138,8 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
unsigned int flags = 0, irq = desc->irq_data.irq;
struct irqaction *action = desc->action;
- do {
+ /* action might have become NULL since we dropped the lock */
+ while (action) {
irqreturn_t res;
trace_irq_handler_entry(irq, action);
@@ -173,7 +174,7 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc)
retval |= res;
action = action->next;
- } while (action);
+ }
add_interrupt_randomness(irq, flags);
diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index 6e655f7acd3b..3e56d2f03e24 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -575,10 +575,15 @@ unsigned int irq_create_fwspec_mapping(struct irq_fwspec *fwspec)
unsigned int type = IRQ_TYPE_NONE;
int virq;
- if (fwspec->fwnode)
- domain = irq_find_matching_fwnode(fwspec->fwnode, DOMAIN_BUS_ANY);
- else
+ if (fwspec->fwnode) {
+ domain = irq_find_matching_fwnode(fwspec->fwnode,
+ DOMAIN_BUS_WIRED);
+ if (!domain)
+ domain = irq_find_matching_fwnode(fwspec->fwnode,
+ DOMAIN_BUS_ANY);
+ } else {
domain = irq_default_domain;
+ }
if (!domain) {
pr_warn("no irq domain found for %s !\n",
diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c
index 60ace56618f6..716547fdb873 100644
--- a/kernel/locking/lockdep.c
+++ b/kernel/locking/lockdep.c
@@ -292,7 +292,7 @@ LIST_HEAD(all_lock_classes);
#define __classhashfn(key) hash_long((unsigned long)key, CLASSHASH_BITS)
#define classhashentry(key) (classhash_table + __classhashfn((key)))
-static struct list_head classhash_table[CLASSHASH_SIZE];
+static struct hlist_head classhash_table[CLASSHASH_SIZE];
/*
* We put the lock dependency chains into a hash-table as well, to cache
@@ -303,7 +303,7 @@ static struct list_head classhash_table[CLASSHASH_SIZE];
#define __chainhashfn(chain) hash_long(chain, CHAINHASH_BITS)
#define chainhashentry(chain) (chainhash_table + __chainhashfn((chain)))
-static struct list_head chainhash_table[CHAINHASH_SIZE];
+static struct hlist_head chainhash_table[CHAINHASH_SIZE];
/*
* The hash key of the lock dependency chains is a hash itself too:
@@ -666,7 +666,7 @@ static inline struct lock_class *
look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
#ifdef CONFIG_DEBUG_LOCKDEP
@@ -719,7 +719,7 @@ look_up_lock_class(struct lockdep_map *lock, unsigned int subclass)
if (DEBUG_LOCKS_WARN_ON(!irqs_disabled()))
return NULL;
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key) {
/*
* Huh! same key, different name? Did someone trample
@@ -742,7 +742,7 @@ static inline struct lock_class *
register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
{
struct lockdep_subclass_key *key;
- struct list_head *hash_head;
+ struct hlist_head *hash_head;
struct lock_class *class;
DEBUG_LOCKS_WARN_ON(!irqs_disabled());
@@ -774,7 +774,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We have to do the hash-walk again, to avoid races
* with another CPU:
*/
- list_for_each_entry_rcu(class, hash_head, hash_entry) {
+ hlist_for_each_entry_rcu(class, hash_head, hash_entry) {
if (class->key == key)
goto out_unlock_set;
}
@@ -805,7 +805,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
* We use RCU's safe list-add method to make
* parallel walking of the hash-list safe:
*/
- list_add_tail_rcu(&class->hash_entry, hash_head);
+ hlist_add_head_rcu(&class->hash_entry, hash_head);
/*
* Add it to the global list of classes:
*/
@@ -1822,7 +1822,7 @@ check_deadlock(struct task_struct *curr, struct held_lock *next,
*/
static int
check_prev_add(struct task_struct *curr, struct held_lock *prev,
- struct held_lock *next, int distance, int trylock_loop)
+ struct held_lock *next, int distance, int *stack_saved)
{
struct lock_list *entry;
int ret;
@@ -1883,8 +1883,11 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
}
}
- if (!trylock_loop && !save_trace(&trace))
- return 0;
+ if (!*stack_saved) {
+ if (!save_trace(&trace))
+ return 0;
+ *stack_saved = 1;
+ }
/*
* Ok, all validations passed, add the new lock
@@ -1907,6 +1910,8 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev,
* Debugging printouts:
*/
if (verbose(hlock_class(prev)) || verbose(hlock_class(next))) {
+ /* We drop graph lock, so another thread can overwrite trace. */
+ *stack_saved = 0;
graph_unlock();
printk("\n new dependency: ");
print_lock_name(hlock_class(prev));
@@ -1929,7 +1934,7 @@ static int
check_prevs_add(struct task_struct *curr, struct held_lock *next)
{
int depth = curr->lockdep_depth;
- int trylock_loop = 0;
+ int stack_saved = 0;
struct held_lock *hlock;
/*
@@ -1956,7 +1961,7 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
*/
if (hlock->read != 2 && hlock->check) {
if (!check_prev_add(curr, hlock, next,
- distance, trylock_loop))
+ distance, &stack_saved))
return 0;
/*
* Stop after the first non-trylock entry,
@@ -1979,7 +1984,6 @@ check_prevs_add(struct task_struct *curr, struct held_lock *next)
if (curr->held_locks[depth].irq_context !=
curr->held_locks[depth-1].irq_context)
break;
- trylock_loop = 1;
}
return 1;
out_bug:
@@ -2017,7 +2021,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
u64 chain_key)
{
struct lock_class *class = hlock_class(hlock);
- struct list_head *hash_head = chainhashentry(chain_key);
+ struct hlist_head *hash_head = chainhashentry(chain_key);
struct lock_chain *chain;
struct held_lock *hlock_curr;
int i, j;
@@ -2033,7 +2037,7 @@ static inline int lookup_chain_cache(struct task_struct *curr,
* We can walk it lock-free, because entries only get added
* to the hash:
*/
- list_for_each_entry_rcu(chain, hash_head, entry) {
+ hlist_for_each_entry_rcu(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
cache_hit:
debug_atomic_inc(chain_lookup_hits);
@@ -2057,7 +2061,7 @@ cache_hit:
/*
* We have to walk the chain again locked - to avoid duplicates:
*/
- list_for_each_entry(chain, hash_head, entry) {
+ hlist_for_each_entry(chain, hash_head, entry) {
if (chain->chain_key == chain_key) {
graph_unlock();
goto cache_hit;
@@ -2091,7 +2095,7 @@ cache_hit:
}
chain_hlocks[chain->base + j] = class - lock_classes;
}
- list_add_tail_rcu(&chain->entry, hash_head);
+ hlist_add_head_rcu(&chain->entry, hash_head);
debug_atomic_inc(chain_lookup_misses);
inc_chains();
@@ -3875,7 +3879,7 @@ void lockdep_reset(void)
nr_process_chains = 0;
debug_locks = 1;
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
raw_local_irq_restore(flags);
}
@@ -3894,7 +3898,7 @@ static void zap_class(struct lock_class *class)
/*
* Unhash the class and remove it from the all_lock_classes list:
*/
- list_del_rcu(&class->hash_entry);
+ hlist_del_rcu(&class->hash_entry);
list_del_rcu(&class->lock_entry);
RCU_INIT_POINTER(class->key, NULL);
@@ -3917,7 +3921,7 @@ static inline int within(const void *addr, void *start, unsigned long size)
void lockdep_free_key_range(void *start, unsigned long size)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i;
int locked;
@@ -3930,9 +3934,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
*/
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
if (within(class->key, start, size))
zap_class(class);
else if (within(class->name, start, size))
@@ -3962,7 +3964,7 @@ void lockdep_free_key_range(void *start, unsigned long size)
void lockdep_reset_lock(struct lockdep_map *lock)
{
struct lock_class *class;
- struct list_head *head;
+ struct hlist_head *head;
unsigned long flags;
int i, j;
int locked;
@@ -3987,9 +3989,7 @@ void lockdep_reset_lock(struct lockdep_map *lock)
locked = graph_lock();
for (i = 0; i < CLASSHASH_SIZE; i++) {
head = classhash_table + i;
- if (list_empty(head))
- continue;
- list_for_each_entry_rcu(class, head, hash_entry) {
+ hlist_for_each_entry_rcu(class, head, hash_entry) {
int match = 0;
for (j = 0; j < NR_LOCKDEP_CACHING_CLASSES; j++)
@@ -4027,10 +4027,10 @@ void lockdep_init(void)
return;
for (i = 0; i < CLASSHASH_SIZE; i++)
- INIT_LIST_HEAD(classhash_table + i);
+ INIT_HLIST_HEAD(classhash_table + i);
for (i = 0; i < CHAINHASH_SIZE; i++)
- INIT_LIST_HEAD(chainhash_table + i);
+ INIT_HLIST_HEAD(chainhash_table + i);
lockdep_initialized = 1;
}
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
index 8251e75dd9c0..3e746607abe5 100644
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -99,13 +99,14 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
* 2) Drop lock->wait_lock
* 3) Try to unlock the lock with cmpxchg
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
struct task_struct *owner = rt_mutex_owner(lock);
clear_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/*
* If a new waiter comes in between the unlock and the cmpxchg
* we have two situations:
@@ -147,11 +148,12 @@ static inline void mark_rt_mutex_waiters(struct rt_mutex *lock)
/*
* Simple slow path only version: lock->owner is protected by lock->wait_lock.
*/
-static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock)
+static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock,
+ unsigned long flags)
__releases(lock->wait_lock)
{
lock->owner = NULL;
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return true;
}
#endif
@@ -433,7 +435,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
int ret = 0, depth = 0;
struct rt_mutex *lock;
bool detect_deadlock;
- unsigned long flags;
bool requeue = true;
detect_deadlock = rt_mutex_cond_detect_deadlock(orig_waiter, chwalk);
@@ -476,7 +477,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* [1] Task cannot go away as we did a get_task() before !
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock_irq(&task->pi_lock);
/*
* [2] Get the waiter on which @task is blocked on.
@@ -560,7 +561,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* operations.
*/
if (!raw_spin_trylock(&lock->wait_lock)) {
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
cpu_relax();
goto retry;
}
@@ -591,7 +592,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* No requeue[7] here. Just release @task [8]
*/
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -599,14 +600,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
* If there is no owner of the lock, end of chain.
*/
if (!rt_mutex_owner(lock)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/*
* No requeue [11] here. We just do deadlock detection.
@@ -621,8 +622,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/* If owner is not blocked, end of chain. */
if (!next_lock)
@@ -643,7 +644,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
rt_mutex_enqueue(lock, waiter);
/* [8] Release the task */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
put_task_struct(task);
/*
@@ -661,14 +662,14 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
*/
if (prerequeue_top_waiter != rt_mutex_top_waiter(lock))
wake_up_process(rt_mutex_top_waiter(lock)->task);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 0;
}
/* [10] Grab the next task, i.e. the owner of @lock */
task = rt_mutex_owner(lock);
get_task_struct(task);
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
/* [11] requeue the pi waiters if necessary */
if (waiter == rt_mutex_top_waiter(lock)) {
@@ -722,8 +723,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
top_waiter = rt_mutex_top_waiter(lock);
/* [13] Drop the locks */
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock(&task->pi_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
/*
* Make the actual exit decisions [12], based on the stored
@@ -746,7 +747,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
goto again;
out_unlock_pi:
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock_irq(&task->pi_lock);
out_put_task:
put_task_struct(task);
@@ -756,7 +757,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
/*
* Try to take an rt-mutex
*
- * Must be called with lock->wait_lock held.
+ * Must be called with lock->wait_lock held and interrupts disabled
*
* @lock: The lock to be acquired.
* @task: The task which wants to acquire the lock
@@ -766,8 +767,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
{
- unsigned long flags;
-
/*
* Before testing whether we can acquire @lock, we set the
* RT_MUTEX_HAS_WAITERS bit in @lock->owner. This forces all
@@ -852,7 +851,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
* case, but conditionals are more expensive than a redundant
* store.
*/
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
task->pi_blocked_on = NULL;
/*
* Finish the lock acquisition. @task is the new owner. If
@@ -861,7 +860,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
*/
if (rt_mutex_has_waiters(lock))
rt_mutex_enqueue_pi(task, rt_mutex_top_waiter(lock));
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
takeit:
/* We got the lock. */
@@ -883,7 +882,7 @@ takeit:
*
* Prepare waiter and propagate pi chain
*
- * This must be called with lock->wait_lock held.
+ * This must be called with lock->wait_lock held and interrupts disabled
*/
static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
@@ -894,7 +893,6 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
struct rt_mutex_waiter *top_waiter = waiter;
struct rt_mutex *next_lock;
int chain_walk = 0, res;
- unsigned long flags;
/*
* Early deadlock detection. We really don't want the task to
@@ -908,7 +906,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
if (owner == task)
return -EDEADLK;
- raw_spin_lock_irqsave(&task->pi_lock, flags);
+ raw_spin_lock(&task->pi_lock);
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
@@ -921,12 +919,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
task->pi_blocked_on = waiter;
- raw_spin_unlock_irqrestore(&task->pi_lock, flags);
+ raw_spin_unlock(&task->pi_lock);
if (!owner)
return 0;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
if (waiter == rt_mutex_top_waiter(lock)) {
rt_mutex_dequeue_pi(owner, top_waiter);
rt_mutex_enqueue_pi(owner, waiter);
@@ -941,7 +939,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Even if full deadlock detection is on, if the owner is not
* blocked itself, we can avoid finding this out in the chain
@@ -957,12 +955,12 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
*/
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
res = rt_mutex_adjust_prio_chain(owner, chwalk, lock,
next_lock, waiter, task);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
return res;
}
@@ -971,15 +969,14 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
* Remove the top waiter from the current tasks pi waiter tree and
* queue it up.
*
- * Called with lock->wait_lock held.
+ * Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
waiter = rt_mutex_top_waiter(lock);
@@ -1001,7 +998,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
*/
lock->owner = (void *) RT_MUTEX_HAS_WAITERS;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
wake_q_add(wake_q, waiter->task);
}
@@ -1009,7 +1006,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Remove a waiter from a lock and give up
*
- * Must be called with lock->wait_lock held and
+ * Must be called with lock->wait_lock held and interrupts disabled. I must
* have just failed to try_to_take_rt_mutex().
*/
static void remove_waiter(struct rt_mutex *lock,
@@ -1018,12 +1015,11 @@ static void remove_waiter(struct rt_mutex *lock,
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
struct rt_mutex *next_lock;
- unsigned long flags;
- raw_spin_lock_irqsave(&current->pi_lock, flags);
+ raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
current->pi_blocked_on = NULL;
- raw_spin_unlock_irqrestore(&current->pi_lock, flags);
+ raw_spin_unlock(&current->pi_lock);
/*
* Only update priority if the waiter was the highest priority
@@ -1032,7 +1028,7 @@ static void remove_waiter(struct rt_mutex *lock,
if (!owner || !is_top_waiter)
return;
- raw_spin_lock_irqsave(&owner->pi_lock, flags);
+ raw_spin_lock(&owner->pi_lock);
rt_mutex_dequeue_pi(owner, waiter);
@@ -1044,7 +1040,7 @@ static void remove_waiter(struct rt_mutex *lock,
/* Store the lock on which owner is blocked or NULL */
next_lock = task_blocked_on_lock(owner);
- raw_spin_unlock_irqrestore(&owner->pi_lock, flags);
+ raw_spin_unlock(&owner->pi_lock);
/*
* Don't walk the chain, if the owner task is not blocked
@@ -1056,12 +1052,12 @@ static void remove_waiter(struct rt_mutex *lock,
/* gets dropped in rt_mutex_adjust_prio_chain()! */
get_task_struct(owner);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
rt_mutex_adjust_prio_chain(owner, RT_MUTEX_MIN_CHAINWALK, lock,
next_lock, NULL, current);
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
}
/*
@@ -1097,11 +1093,11 @@ void rt_mutex_adjust_pi(struct task_struct *task)
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
* @state: the state the task should block in (TASK_INTERRUPTIBLE
- * or TASK_UNINTERRUPTIBLE)
+ * or TASK_UNINTERRUPTIBLE)
* @timeout: the pre-initialized and started timer, or NULL for none
* @waiter: the pre-initialized rt_mutex_waiter
*
- * lock->wait_lock must be held by the caller.
+ * Must be called with lock->wait_lock held and interrupts disabled
*/
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
@@ -1129,13 +1125,13 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state,
break;
}
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
schedule();
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(state);
}
@@ -1172,17 +1168,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk)
{
struct rt_mutex_waiter waiter;
+ unsigned long flags;
int ret = 0;
debug_rt_mutex_init_waiter(&waiter);
RB_CLEAR_NODE(&waiter.pi_tree_entry);
RB_CLEAR_NODE(&waiter.tree_entry);
- raw_spin_lock(&lock->wait_lock);
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+ * be called in early boot if the cmpxchg() fast path is disabled
+ * (debug, no architecture support). In this case we will acquire the
+ * rtmutex with lock->wait_lock held. But we cannot unconditionally
+ * enable interrupts in that early boot case. So we need to use the
+ * irqsave/restore variants.
+ */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
@@ -1211,7 +1216,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* Remove pending timer: */
if (unlikely(timeout))
@@ -1227,6 +1232,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
*/
static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
{
+ unsigned long flags;
int ret;
/*
@@ -1238,10 +1244,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
return 0;
/*
- * The mutex has currently no owner. Lock the wait lock and
- * try to acquire the lock.
+ * The mutex has currently no owner. Lock the wait lock and try to
+ * acquire the lock. We use irqsave here to support early boot calls.
*/
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
ret = try_to_take_rt_mutex(lock, current, NULL);
@@ -1251,7 +1257,7 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return ret;
}
@@ -1263,7 +1269,10 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock)
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
struct wake_q_head *wake_q)
{
- raw_spin_lock(&lock->wait_lock);
+ unsigned long flags;
+
+ /* irqsave required to support early boot calls */
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
debug_rt_mutex_unlock(lock);
@@ -1302,10 +1311,10 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
while (!rt_mutex_has_waiters(lock)) {
/* Drops lock->wait_lock ! */
- if (unlock_rt_mutex_safe(lock) == true)
+ if (unlock_rt_mutex_safe(lock, flags) == true)
return false;
/* Relock the rtmutex and try again */
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
}
/*
@@ -1316,7 +1325,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
*/
mark_wakeup_next_waiter(wake_q, lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
/* check PI boosting */
return true;
@@ -1596,10 +1605,10 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return 1;
}
@@ -1620,7 +1629,7 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (unlikely(ret))
remove_waiter(lock, waiter);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
@@ -1668,7 +1677,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
{
int ret;
- raw_spin_lock(&lock->wait_lock);
+ raw_spin_lock_irq(&lock->wait_lock);
set_current_state(TASK_INTERRUPTIBLE);
@@ -1684,7 +1693,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
*/
fixup_rt_mutex_waiters(lock);
- raw_spin_unlock(&lock->wait_lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
return ret;
}
diff --git a/kernel/memremap.c b/kernel/memremap.c
index 293309cac061..97b31c774274 100644
--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -115,7 +115,7 @@ EXPORT_SYMBOL(memunmap);
static void devm_memremap_release(struct device *dev, void *res)
{
- memunmap(res);
+ memunmap(*(void **)res);
}
static int devm_memremap_match(struct device *dev, void *res, void *match_data)
@@ -137,8 +137,10 @@ void *devm_memremap(struct device *dev, resource_size_t offset,
if (addr) {
*ptr = addr;
devres_add(dev, ptr);
- } else
+ } else {
devres_free(ptr);
+ return ERR_PTR(-ENXIO);
+ }
return addr;
}
@@ -151,7 +153,7 @@ void devm_memunmap(struct device *dev, void *addr)
}
EXPORT_SYMBOL(devm_memunmap);
-pfn_t phys_to_pfn_t(dma_addr_t addr, unsigned long flags)
+pfn_t phys_to_pfn_t(phys_addr_t addr, u64 flags)
{
return __pfn_to_pfn_t(addr >> PAGE_SHIFT, flags);
}
@@ -184,7 +186,11 @@ EXPORT_SYMBOL(put_zone_device_page);
static void pgmap_radix_release(struct resource *res)
{
- resource_size_t key;
+ resource_size_t key, align_start, align_size, align_end;
+
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ align_end = align_start + align_size - 1;
mutex_lock(&pgmap_lock);
for (key = res->start; key <= res->end; key += SECTION_SIZE)
@@ -227,12 +233,11 @@ static void devm_memremap_pages_release(struct device *dev, void *data)
percpu_ref_put(pgmap->ref);
}
- pgmap_radix_release(res);
-
/* pages are dead and unused, undo the arch mapping */
align_start = res->start & ~(SECTION_SIZE - 1);
align_size = ALIGN(resource_size(res), SECTION_SIZE);
arch_remove_memory(align_start, align_size);
+ pgmap_radix_release(res);
dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc,
"%s: failed to free all reserved pages\n", __func__);
}
@@ -268,7 +273,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
{
int is_ram = region_intersects(res->start, resource_size(res),
IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE);
- resource_size_t key, align_start, align_size;
+ resource_size_t key, align_start, align_size, align_end;
struct dev_pagemap *pgmap;
struct page_map *page_map;
unsigned long pfn;
@@ -310,7 +315,10 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
mutex_lock(&pgmap_lock);
error = 0;
- for (key = res->start; key <= res->end; key += SECTION_SIZE) {
+ align_start = res->start & ~(SECTION_SIZE - 1);
+ align_size = ALIGN(resource_size(res), SECTION_SIZE);
+ align_end = align_start + align_size - 1;
+ for (key = align_start; key <= align_end; key += SECTION_SIZE) {
struct dev_pagemap *dup;
rcu_read_lock();
@@ -337,8 +345,6 @@ void *devm_memremap_pages(struct device *dev, struct resource *res,
if (nid < 0)
nid = numa_mem_id();
- align_start = res->start & ~(SECTION_SIZE - 1);
- align_size = ALIGN(resource_size(res), SECTION_SIZE);
error = arch_add_memory(nid, align_start, align_size, true);
if (error)
goto err_add_memory;
diff --git a/kernel/module.c b/kernel/module.c
index 8358f4697c0c..794ebe8e878d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -303,6 +303,9 @@ struct load_info {
struct _ddebug *debug;
unsigned int num_debug;
bool sig_ok;
+#ifdef CONFIG_KALLSYMS
+ unsigned long mod_kallsyms_init_off;
+#endif
struct {
unsigned int sym, str, mod, vers, info, pcpu;
} index;
@@ -981,6 +984,8 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
mod->exit();
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
+ ftrace_release_mod(mod);
+
async_synchronize_full();
/* Store the name of the last unloaded module for diagnostic purposes */
@@ -2480,10 +2485,21 @@ static void layout_symtab(struct module *mod, struct load_info *info)
strsect->sh_flags |= SHF_ALLOC;
strsect->sh_entsize = get_offset(mod, &mod->init_layout.size, strsect,
info->index.str) | INIT_OFFSET_MASK;
- mod->init_layout.size = debug_align(mod->init_layout.size);
pr_debug("\t%s\n", info->secstrings + strsect->sh_name);
+
+ /* We'll tack temporary mod_kallsyms on the end. */
+ mod->init_layout.size = ALIGN(mod->init_layout.size,
+ __alignof__(struct mod_kallsyms));
+ info->mod_kallsyms_init_off = mod->init_layout.size;
+ mod->init_layout.size += sizeof(struct mod_kallsyms);
+ mod->init_layout.size = debug_align(mod->init_layout.size);
}
+/*
+ * We use the full symtab and strtab which layout_symtab arranged to
+ * be appended to the init section. Later we switch to the cut-down
+ * core-only ones.
+ */
static void add_kallsyms(struct module *mod, const struct load_info *info)
{
unsigned int i, ndst;
@@ -2492,29 +2508,34 @@ static void add_kallsyms(struct module *mod, const struct load_info *info)
char *s;
Elf_Shdr *symsec = &info->sechdrs[info->index.sym];
- mod->symtab = (void *)symsec->sh_addr;
- mod->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
+ /* Set up to point into init section. */
+ mod->kallsyms = mod->init_layout.base + info->mod_kallsyms_init_off;
+
+ mod->kallsyms->symtab = (void *)symsec->sh_addr;
+ mod->kallsyms->num_symtab = symsec->sh_size / sizeof(Elf_Sym);
/* Make sure we get permanent strtab: don't use info->strtab. */
- mod->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
+ mod->kallsyms->strtab = (void *)info->sechdrs[info->index.str].sh_addr;
/* Set types up while we still have access to sections. */
- for (i = 0; i < mod->num_symtab; i++)
- mod->symtab[i].st_info = elf_type(&mod->symtab[i], info);
-
- mod->core_symtab = dst = mod->core_layout.base + info->symoffs;
- mod->core_strtab = s = mod->core_layout.base + info->stroffs;
- src = mod->symtab;
- for (ndst = i = 0; i < mod->num_symtab; i++) {
+ for (i = 0; i < mod->kallsyms->num_symtab; i++)
+ mod->kallsyms->symtab[i].st_info
+ = elf_type(&mod->kallsyms->symtab[i], info);
+
+ /* Now populate the cut down core kallsyms for after init. */
+ mod->core_kallsyms.symtab = dst = mod->core_layout.base + info->symoffs;
+ mod->core_kallsyms.strtab = s = mod->core_layout.base + info->stroffs;
+ src = mod->kallsyms->symtab;
+ for (ndst = i = 0; i < mod->kallsyms->num_symtab; i++) {
if (i == 0 ||
is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum,
info->index.pcpu)) {
dst[ndst] = src[i];
- dst[ndst++].st_name = s - mod->core_strtab;
- s += strlcpy(s, &mod->strtab[src[i].st_name],
+ dst[ndst++].st_name = s - mod->core_kallsyms.strtab;
+ s += strlcpy(s, &mod->kallsyms->strtab[src[i].st_name],
KSYM_NAME_LEN) + 1;
}
}
- mod->core_num_syms = ndst;
+ mod->core_kallsyms.num_symtab = ndst;
}
#else
static inline void layout_symtab(struct module *mod, struct load_info *info)
@@ -3263,9 +3284,8 @@ static noinline int do_init_module(struct module *mod)
module_put(mod);
trim_init_extable(mod);
#ifdef CONFIG_KALLSYMS
- mod->num_symtab = mod->core_num_syms;
- mod->symtab = mod->core_symtab;
- mod->strtab = mod->core_strtab;
+ /* Switch to core kallsyms now init is done: kallsyms may be walking! */
+ rcu_assign_pointer(mod->kallsyms, &mod->core_kallsyms);
#endif
mod_tree_remove_init(mod);
disable_ro_nx(&mod->init_layout);
@@ -3295,6 +3315,7 @@ fail:
module_put(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_GOING, mod);
+ ftrace_release_mod(mod);
free_module(mod);
wake_up_all(&module_wq);
return ret;
@@ -3371,6 +3392,7 @@ static int complete_formation(struct module *mod, struct load_info *info)
mod->state = MODULE_STATE_COMING;
mutex_unlock(&module_mutex);
+ ftrace_module_enable(mod);
blocking_notifier_call_chain(&module_notify_list,
MODULE_STATE_COMING, mod);
return 0;
@@ -3496,7 +3518,7 @@ static int load_module(struct load_info *info, const char __user *uargs,
/* Module is ready to execute: parsing args may do that. */
after_dashes = parse_args(mod->name, mod->args, mod->kp, mod->num_kp,
- -32768, 32767, NULL,
+ -32768, 32767, mod,
unknown_module_param_cb);
if (IS_ERR(after_dashes)) {
err = PTR_ERR(after_dashes);
@@ -3627,6 +3649,11 @@ static inline int is_arm_mapping_symbol(const char *str)
&& (str[2] == '\0' || str[2] == '.');
}
+static const char *symname(struct mod_kallsyms *kallsyms, unsigned int symnum)
+{
+ return kallsyms->strtab + kallsyms->symtab[symnum].st_name;
+}
+
static const char *get_ksymbol(struct module *mod,
unsigned long addr,
unsigned long *size,
@@ -3634,6 +3661,7 @@ static const char *get_ksymbol(struct module *mod,
{
unsigned int i, best = 0;
unsigned long nextval;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
/* At worse, next value is at end of module */
if (within_module_init(addr, mod))
@@ -3643,32 +3671,32 @@ static const char *get_ksymbol(struct module *mod,
/* Scan for closest preceding symbol, and next symbol. (ELF
starts real symbols at 1). */
- for (i = 1; i < mod->num_symtab; i++) {
- if (mod->symtab[i].st_shndx == SHN_UNDEF)
+ for (i = 1; i < kallsyms->num_symtab; i++) {
+ if (kallsyms->symtab[i].st_shndx == SHN_UNDEF)
continue;
/* We ignore unnamed symbols: they're uninformative
* and inserted at a whim. */
- if (mod->symtab[i].st_value <= addr
- && mod->symtab[i].st_value > mod->symtab[best].st_value
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
+ if (*symname(kallsyms, i) == '\0'
+ || is_arm_mapping_symbol(symname(kallsyms, i)))
+ continue;
+
+ if (kallsyms->symtab[i].st_value <= addr
+ && kallsyms->symtab[i].st_value > kallsyms->symtab[best].st_value)
best = i;
- if (mod->symtab[i].st_value > addr
- && mod->symtab[i].st_value < nextval
- && *(mod->strtab + mod->symtab[i].st_name) != '\0'
- && !is_arm_mapping_symbol(mod->strtab + mod->symtab[i].st_name))
- nextval = mod->symtab[i].st_value;
+ if (kallsyms->symtab[i].st_value > addr
+ && kallsyms->symtab[i].st_value < nextval)
+ nextval = kallsyms->symtab[i].st_value;
}
if (!best)
return NULL;
if (size)
- *size = nextval - mod->symtab[best].st_value;
+ *size = nextval - kallsyms->symtab[best].st_value;
if (offset)
- *offset = addr - mod->symtab[best].st_value;
- return mod->strtab + mod->symtab[best].st_name;
+ *offset = addr - kallsyms->symtab[best].st_value;
+ return symname(kallsyms, best);
}
/* For kallsyms to ask for address resolution. NULL means not found. Careful
@@ -3758,19 +3786,21 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
preempt_disable();
list_for_each_entry_rcu(mod, &modules, list) {
+ struct mod_kallsyms *kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- if (symnum < mod->num_symtab) {
- *value = mod->symtab[symnum].st_value;
- *type = mod->symtab[symnum].st_info;
- strlcpy(name, mod->strtab + mod->symtab[symnum].st_name,
- KSYM_NAME_LEN);
+ kallsyms = rcu_dereference_sched(mod->kallsyms);
+ if (symnum < kallsyms->num_symtab) {
+ *value = kallsyms->symtab[symnum].st_value;
+ *type = kallsyms->symtab[symnum].st_info;
+ strlcpy(name, symname(kallsyms, symnum), KSYM_NAME_LEN);
strlcpy(module_name, mod->name, MODULE_NAME_LEN);
*exported = is_exported(name, *value, mod);
preempt_enable();
return 0;
}
- symnum -= mod->num_symtab;
+ symnum -= kallsyms->num_symtab;
}
preempt_enable();
return -ERANGE;
@@ -3779,11 +3809,12 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
static unsigned long mod_find_symname(struct module *mod, const char *name)
{
unsigned int i;
+ struct mod_kallsyms *kallsyms = rcu_dereference_sched(mod->kallsyms);
- for (i = 0; i < mod->num_symtab; i++)
- if (strcmp(name, mod->strtab+mod->symtab[i].st_name) == 0 &&
- mod->symtab[i].st_info != 'U')
- return mod->symtab[i].st_value;
+ for (i = 0; i < kallsyms->num_symtab; i++)
+ if (strcmp(name, symname(kallsyms, i)) == 0 &&
+ kallsyms->symtab[i].st_info != 'U')
+ return kallsyms->symtab[i].st_value;
return 0;
}
@@ -3822,11 +3853,14 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
module_assert_mutex();
list_for_each_entry(mod, &modules, list) {
+ /* We hold module_mutex: no need for rcu_dereference_sched */
+ struct mod_kallsyms *kallsyms = mod->kallsyms;
+
if (mod->state == MODULE_STATE_UNFORMED)
continue;
- for (i = 0; i < mod->num_symtab; i++) {
- ret = fn(data, mod->strtab + mod->symtab[i].st_name,
- mod, mod->symtab[i].st_value);
+ for (i = 0; i < kallsyms->num_symtab; i++) {
+ ret = fn(data, symname(kallsyms, i),
+ mod, kallsyms->symtab[i].st_value);
if (ret != 0)
return ret;
}
diff --git a/kernel/pid.c b/kernel/pid.c
index f4ad91b746f1..4d73a834c7e6 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -588,7 +588,7 @@ void __init pidhash_init(void)
void __init pidmap_init(void)
{
- /* Veryify no one has done anything silly */
+ /* Verify no one has done anything silly: */
BUILD_BUG_ON(PID_MAX_LIMIT >= PIDNS_HASH_ADDING);
/* bump default and minimum pid_max based on number of cpus */
diff --git a/kernel/resource.c b/kernel/resource.c
index 49834309043c..4d466052426b 100644
--- a/kernel/resource.c
+++ b/kernel/resource.c
@@ -1098,9 +1098,10 @@ struct resource * __request_region(struct resource *parent,
if (!conflict)
break;
if (conflict != parent) {
- parent = conflict;
- if (!(conflict->flags & IORESOURCE_BUSY))
+ if (!(conflict->flags & IORESOURCE_BUSY)) {
+ parent = conflict;
continue;
+ }
}
if (conflict->flags & flags & IORESOURCE_MUXED) {
add_wait_queue(&muxed_resource_wait, &wait);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 63d3a24e081a..9503d590e5ef 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6840,7 +6840,7 @@ static void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
- for (k = 0; k < nr_node_ids; k++) {
+ for_each_node(k) {
if (node_distance(j, k) > sched_domains_numa_distance[i])
continue;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index cd64c979d0e1..57b939c81bce 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -420,7 +420,7 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se,
* entity.
*/
if (dl_time_before(dl_se->deadline, rq_clock(rq))) {
- printk_deferred_once("sched: DL replenish lagged to much\n");
+ printk_deferred_once("sched: DL replenish lagged too much\n");
dl_se->deadline = rq_clock(rq) + pi_se->dl_deadline;
dl_se->runtime = pi_se->dl_runtime;
}
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1926606ece80..56b7d4b83947 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1220,8 +1220,6 @@ static void task_numa_assign(struct task_numa_env *env,
{
if (env->best_task)
put_task_struct(env->best_task);
- if (p)
- get_task_struct(p);
env->best_task = p;
env->best_imp = imp;
@@ -1289,20 +1287,30 @@ static void task_numa_compare(struct task_numa_env *env,
long imp = env->p->numa_group ? groupimp : taskimp;
long moveimp = imp;
int dist = env->dist;
+ bool assigned = false;
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
- * No need to move the exiting task, and this ensures that ->curr
- * wasn't reaped and thus get_task_struct() in task_numa_assign()
- * is safe under RCU read lock.
- * Note that rcu_read_lock() itself can't protect from the final
- * put_task_struct() after the last schedule().
+ * No need to move the exiting task or idle task.
*/
if ((cur->flags & PF_EXITING) || is_idle_task(cur))
cur = NULL;
+ else {
+ /*
+ * The task_struct must be protected here to protect the
+ * p->numa_faults access in the task_weight since the
+ * numa_faults could already be freed in the following path:
+ * finish_task_switch()
+ * --> put_task_struct()
+ * --> __put_task_struct()
+ * --> task_numa_free()
+ */
+ get_task_struct(cur);
+ }
+
raw_spin_unlock_irq(&dst_rq->lock);
/*
@@ -1386,6 +1394,7 @@ balance:
*/
if (!load_too_imbalanced(src_load, dst_load, env)) {
imp = moveimp - 1;
+ put_task_struct(cur);
cur = NULL;
goto assign;
}
@@ -1411,9 +1420,16 @@ balance:
env->dst_cpu = select_idle_sibling(env->p, env->dst_cpu);
assign:
+ assigned = true;
task_numa_assign(env, cur, imp);
unlock:
rcu_read_unlock();
+ /*
+ * The dst_rq->curr isn't assigned. The protection for task_struct is
+ * finished.
+ */
+ if (cur && !assigned)
+ put_task_struct(cur);
}
static void task_numa_find_cpu(struct task_numa_env *env,
diff --git a/kernel/signal.c b/kernel/signal.c
index f3f1f7a972fd..0508544c8ced 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -3508,8 +3508,10 @@ static int sigsuspend(sigset_t *set)
current->saved_sigmask = current->blocked;
set_current_blocked(set);
- __set_current_state(TASK_INTERRUPTIBLE);
- schedule();
+ while (!signal_pending(current)) {
+ __set_current_state(TASK_INTERRUPTIBLE);
+ schedule();
+ }
set_restore_sigmask();
return -ERESTARTNOHAND;
}
diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c
index 435b8850dd80..fa909f9fd559 100644
--- a/kernel/time/hrtimer.c
+++ b/kernel/time/hrtimer.c
@@ -897,10 +897,10 @@ static int enqueue_hrtimer(struct hrtimer *timer,
*/
static void __remove_hrtimer(struct hrtimer *timer,
struct hrtimer_clock_base *base,
- unsigned long newstate, int reprogram)
+ u8 newstate, int reprogram)
{
struct hrtimer_cpu_base *cpu_base = base->cpu_base;
- unsigned int state = timer->state;
+ u8 state = timer->state;
timer->state = newstate;
if (!(state & HRTIMER_STATE_ENQUEUED))
@@ -930,7 +930,7 @@ static inline int
remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool restart)
{
if (hrtimer_is_queued(timer)) {
- unsigned long state = timer->state;
+ u8 state = timer->state;
int reprogram;
/*
@@ -954,6 +954,22 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base, bool rest
return 0;
}
+static inline ktime_t hrtimer_update_lowres(struct hrtimer *timer, ktime_t tim,
+ const enum hrtimer_mode mode)
+{
+#ifdef CONFIG_TIME_LOW_RES
+ /*
+ * CONFIG_TIME_LOW_RES indicates that the system has no way to return
+ * granular time values. For relative timers we add hrtimer_resolution
+ * (i.e. one jiffie) to prevent short timeouts.
+ */
+ timer->is_rel = mode & HRTIMER_MODE_REL;
+ if (timer->is_rel)
+ tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
+#endif
+ return tim;
+}
+
/**
* hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
* @timer: the timer to be added
@@ -974,19 +990,10 @@ void hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
/* Remove an active timer from the queue: */
remove_hrtimer(timer, base, true);
- if (mode & HRTIMER_MODE_REL) {
+ if (mode & HRTIMER_MODE_REL)
tim = ktime_add_safe(tim, base->get_time());
- /*
- * CONFIG_TIME_LOW_RES is a temporary way for architectures
- * to signal that they simply return xtime in
- * do_gettimeoffset(). In this case we want to round up by
- * resolution when starting a relative timer, to avoid short
- * timeouts. This will go away with the GTOD framework.
- */
-#ifdef CONFIG_TIME_LOW_RES
- tim = ktime_add_safe(tim, ktime_set(0, hrtimer_resolution));
-#endif
- }
+
+ tim = hrtimer_update_lowres(timer, tim, mode);
hrtimer_set_expires_range_ns(timer, tim, delta_ns);
@@ -1074,19 +1081,23 @@ EXPORT_SYMBOL_GPL(hrtimer_cancel);
/**
* hrtimer_get_remaining - get remaining time for the timer
* @timer: the timer to read
+ * @adjust: adjust relative timers when CONFIG_TIME_LOW_RES=y
*/
-ktime_t hrtimer_get_remaining(const struct hrtimer *timer)
+ktime_t __hrtimer_get_remaining(const struct hrtimer *timer, bool adjust)
{
unsigned long flags;
ktime_t rem;
lock_hrtimer_base(timer, &flags);
- rem = hrtimer_expires_remaining(timer);
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES) && adjust)
+ rem = hrtimer_expires_remaining_adjusted(timer);
+ else
+ rem = hrtimer_expires_remaining(timer);
unlock_hrtimer_base(timer, &flags);
return rem;
}
-EXPORT_SYMBOL_GPL(hrtimer_get_remaining);
+EXPORT_SYMBOL_GPL(__hrtimer_get_remaining);
#ifdef CONFIG_NO_HZ_COMMON
/**
@@ -1220,6 +1231,14 @@ static void __run_hrtimer(struct hrtimer_cpu_base *cpu_base,
fn = timer->function;
/*
+ * Clear the 'is relative' flag for the TIME_LOW_RES case. If the
+ * timer is restarted with a period then it becomes an absolute
+ * timer. If its not restarted it does not matter.
+ */
+ if (IS_ENABLED(CONFIG_TIME_LOW_RES))
+ timer->is_rel = false;
+
+ /*
* Because we run timers from hardirq context, there is no chance
* they get migrated to another cpu, therefore its safe to unlock
* the timer base.
diff --git a/kernel/time/itimer.c b/kernel/time/itimer.c
index 8d262b467573..1d5c7204ddc9 100644
--- a/kernel/time/itimer.c
+++ b/kernel/time/itimer.c
@@ -26,7 +26,7 @@
*/
static struct timeval itimer_get_remtime(struct hrtimer *timer)
{
- ktime_t rem = hrtimer_get_remaining(timer);
+ ktime_t rem = __hrtimer_get_remaining(timer, true);
/*
* Racy but safe: if the itimer expires after the above
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 36f2ca09aa5e..6df8927c58a5 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -685,8 +685,18 @@ int ntp_validate_timex(struct timex *txc)
if (!capable(CAP_SYS_TIME))
return -EPERM;
- if (!timeval_inject_offset_valid(&txc->time))
- return -EINVAL;
+ if (txc->modes & ADJ_NANO) {
+ struct timespec ts;
+
+ ts.tv_sec = txc->time.tv_sec;
+ ts.tv_nsec = txc->time.tv_usec;
+ if (!timespec_inject_offset_valid(&ts))
+ return -EINVAL;
+
+ } else {
+ if (!timeval_inject_offset_valid(&txc->time))
+ return -EINVAL;
+ }
}
/*
diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c
index 31d11ac9fa47..f2826c35e918 100644
--- a/kernel/time/posix-timers.c
+++ b/kernel/time/posix-timers.c
@@ -760,7 +760,7 @@ common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
(timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE))
timr->it_overrun += (unsigned int) hrtimer_forward(timer, now, iv);
- remaining = ktime_sub(hrtimer_get_expires(timer), now);
+ remaining = __hrtimer_expires_remaining_adjusted(timer, now);
/* Return 0 only, when the timer is expired and not pending */
if (remaining.tv64 <= 0) {
/*
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 9d7a053545f5..0b17424349eb 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -36,16 +36,17 @@
*/
static DEFINE_PER_CPU(struct tick_sched, tick_cpu_sched);
-/*
- * The time, when the last jiffy update happened. Protected by jiffies_lock.
- */
-static ktime_t last_jiffies_update;
-
struct tick_sched *tick_get_tick_sched(int cpu)
{
return &per_cpu(tick_cpu_sched, cpu);
}
+#if defined(CONFIG_NO_HZ_COMMON) || defined(CONFIG_HIGH_RES_TIMERS)
+/*
+ * The time, when the last jiffy update happened. Protected by jiffies_lock.
+ */
+static ktime_t last_jiffies_update;
+
/*
* Must be called with interrupts disabled !
*/
@@ -151,6 +152,7 @@ static void tick_sched_handle(struct tick_sched *ts, struct pt_regs *regs)
update_process_times(user_mode(regs));
profile_tick(CPU_PROFILING);
}
+#endif
#ifdef CONFIG_NO_HZ_FULL
cpumask_var_t tick_nohz_full_mask;
@@ -993,9 +995,9 @@ static void tick_nohz_switch_to_nohz(void)
/* Get the next period */
next = tick_init_jiffy_update();
- hrtimer_forward_now(&ts->sched_timer, tick_period);
hrtimer_set_expires(&ts->sched_timer, next);
- tick_program_event(next, 1);
+ hrtimer_forward_now(&ts->sched_timer, tick_period);
+ tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
}
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index f75e35b60149..ba7d8b288bb3 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -69,7 +69,7 @@ print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer,
print_name_offset(m, taddr);
SEQ_printf(m, ", ");
print_name_offset(m, timer->function);
- SEQ_printf(m, ", S:%02lx", timer->state);
+ SEQ_printf(m, ", S:%02x", timer->state);
#ifdef CONFIG_TIMER_STATS
SEQ_printf(m, ", ");
print_name_offset(m, timer->start_site);
diff --git a/kernel/trace/bpf_trace.c b/kernel/trace/bpf_trace.c
index 45dd798bcd37..326a75e884db 100644
--- a/kernel/trace/bpf_trace.c
+++ b/kernel/trace/bpf_trace.c
@@ -191,14 +191,17 @@ static u64 bpf_perf_event_read(u64 r1, u64 index, u64 r3, u64 r4, u64 r5)
struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
struct bpf_array *array = container_of(map, struct bpf_array, map);
struct perf_event *event;
+ struct file *file;
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (!event)
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
/* make sure event is local and doesn't have pmu::count */
if (event->oncpu != smp_processor_id() ||
event->pmu->count)
@@ -228,6 +231,7 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
void *data = (void *) (long) r4;
struct perf_sample_data sample_data;
struct perf_event *event;
+ struct file *file;
struct perf_raw_record raw = {
.size = size,
.data = data,
@@ -236,10 +240,12 @@ static u64 bpf_perf_event_output(u64 r1, u64 r2, u64 index, u64 r4, u64 size)
if (unlikely(index >= array->map.max_entries))
return -E2BIG;
- event = (struct perf_event *)array->ptrs[index];
- if (unlikely(!event))
+ file = (struct file *)array->ptrs[index];
+ if (unlikely(!file))
return -ENOENT;
+ event = file->private_data;
+
if (unlikely(event->attr.type != PERF_TYPE_SOFTWARE ||
event->attr.config != PERF_COUNT_SW_BPF_OUTPUT))
return -EINVAL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index eca592f977b2..57a6eea84694 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -4961,7 +4961,7 @@ void ftrace_release_mod(struct module *mod)
mutex_unlock(&ftrace_lock);
}
-static void ftrace_module_enable(struct module *mod)
+void ftrace_module_enable(struct module *mod)
{
struct dyn_ftrace *rec;
struct ftrace_page *pg;
@@ -5038,38 +5038,8 @@ void ftrace_module_init(struct module *mod)
ftrace_process_locs(mod, mod->ftrace_callsites,
mod->ftrace_callsites + mod->num_ftrace_callsites);
}
-
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- struct module *mod = data;
-
- switch (val) {
- case MODULE_STATE_COMING:
- ftrace_module_enable(mod);
- break;
- case MODULE_STATE_GOING:
- ftrace_release_mod(mod);
- break;
- default:
- break;
- }
-
- return 0;
-}
-#else
-static int ftrace_module_notify(struct notifier_block *self,
- unsigned long val, void *data)
-{
- return 0;
-}
#endif /* CONFIG_MODULES */
-struct notifier_block ftrace_module_nb = {
- .notifier_call = ftrace_module_notify,
- .priority = INT_MIN, /* Run after anything that can remove kprobes */
-};
-
void __init ftrace_init(void)
{
extern unsigned long __start_mcount_loc[];
@@ -5098,10 +5068,6 @@ void __init ftrace_init(void)
__start_mcount_loc,
__stop_mcount_loc);
- ret = register_module_notifier(&ftrace_module_nb);
- if (ret)
- pr_warning("Failed to register trace ftrace module exit notifier\n");
-
set_ftrace_early_filters();
return;
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index f333e57c4614..ab09829d3b97 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -869,7 +869,8 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
* The ftrace subsystem is for showing formats only.
* They can not be enabled or disabled via the event files.
*/
- if (call->class && call->class->reg)
+ if (call->class && call->class->reg &&
+ !(call->flags & TRACE_EVENT_FL_IGNORE_ENABLE))
return file;
}
diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index dda9e6742950..2a1abbaca10e 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -126,6 +126,13 @@ check_stack(unsigned long ip, unsigned long *stack)
}
/*
+ * Some archs may not have the passed in ip in the dump.
+ * If that happens, we need to show everything.
+ */
+ if (i == stack_trace_max.nr_entries)
+ i = 0;
+
+ /*
* Now find where in the stack these are.
*/
x = 0;
@@ -149,7 +156,11 @@ check_stack(unsigned long ip, unsigned long *stack)
for (; p < top && i < stack_trace_max.nr_entries; p++) {
if (stack_dump_trace[i] == ULONG_MAX)
break;
- if (*p == stack_dump_trace[i]) {
+ /*
+ * The READ_ONCE_NOCHECK is used to let KASAN know that
+ * this is not a stack-out-of-bounds error.
+ */
+ if ((READ_ONCE_NOCHECK(*p)) == stack_dump_trace[i]) {
stack_dump_trace[x] = stack_dump_trace[i++];
this_size = stack_trace_index[x++] =
(top - p) * sizeof(unsigned long);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 61a0264e28f9..7ff5dc7d2ac5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -301,7 +301,23 @@ static DEFINE_SPINLOCK(wq_mayday_lock); /* protects wq->maydays list */
static LIST_HEAD(workqueues); /* PR: list of all workqueues */
static bool workqueue_freezing; /* PL: have wqs started freezing? */
-static cpumask_var_t wq_unbound_cpumask; /* PL: low level cpumask for all unbound wqs */
+/* PL: allowable cpus for unbound wqs and work items */
+static cpumask_var_t wq_unbound_cpumask;
+
+/* CPU where unbound work was last round robin scheduled from this CPU */
+static DEFINE_PER_CPU(int, wq_rr_cpu_last);
+
+/*
+ * Local execution of unbound work items is no longer guaranteed. The
+ * following always forces round-robin CPU selection on unbound work items
+ * to uncover usages which depend on it.
+ */
+#ifdef CONFIG_DEBUG_WQ_FORCE_RR_CPU
+static bool wq_debug_force_rr_cpu = true;
+#else
+static bool wq_debug_force_rr_cpu = false;
+#endif
+module_param_named(debug_force_rr_cpu, wq_debug_force_rr_cpu, bool, 0644);
/* the per-cpu worker pools */
static DEFINE_PER_CPU_SHARED_ALIGNED(struct worker_pool [NR_STD_WORKER_POOLS],
@@ -570,6 +586,16 @@ static struct pool_workqueue *unbound_pwq_by_node(struct workqueue_struct *wq,
int node)
{
assert_rcu_or_wq_mutex_or_pool_mutex(wq);
+
+ /*
+ * XXX: @node can be NUMA_NO_NODE if CPU goes offline while a
+ * delayed item is pending. The plan is to keep CPU -> NODE
+ * mapping valid and stable across CPU on/offlines. Once that
+ * happens, this workaround can be removed.
+ */
+ if (unlikely(node == NUMA_NO_NODE))
+ return wq->dfl_pwq;
+
return rcu_dereference_raw(wq->numa_pwq_tbl[node]);
}
@@ -1298,6 +1324,39 @@ static bool is_chained_work(struct workqueue_struct *wq)
return worker && worker->current_pwq->wq == wq;
}
+/*
+ * When queueing an unbound work item to a wq, prefer local CPU if allowed
+ * by wq_unbound_cpumask. Otherwise, round robin among the allowed ones to
+ * avoid perturbing sensitive tasks.
+ */
+static int wq_select_unbound_cpu(int cpu)
+{
+ static bool printed_dbg_warning;
+ int new_cpu;
+
+ if (likely(!wq_debug_force_rr_cpu)) {
+ if (cpumask_test_cpu(cpu, wq_unbound_cpumask))
+ return cpu;
+ } else if (!printed_dbg_warning) {
+ pr_warn("workqueue: round-robin CPU selection forced, expect performance impact\n");
+ printed_dbg_warning = true;
+ }
+
+ if (cpumask_empty(wq_unbound_cpumask))
+ return cpu;
+
+ new_cpu = __this_cpu_read(wq_rr_cpu_last);
+ new_cpu = cpumask_next_and(new_cpu, wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids)) {
+ new_cpu = cpumask_first_and(wq_unbound_cpumask, cpu_online_mask);
+ if (unlikely(new_cpu >= nr_cpu_ids))
+ return cpu;
+ }
+ __this_cpu_write(wq_rr_cpu_last, new_cpu);
+
+ return new_cpu;
+}
+
static void __queue_work(int cpu, struct workqueue_struct *wq,
struct work_struct *work)
{
@@ -1323,7 +1382,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
return;
retry:
if (req_cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
+ cpu = wq_select_unbound_cpu(raw_smp_processor_id());
/* pwq which will be used unless @work is executing elsewhere */
if (!(wq->flags & WQ_UNBOUND))
@@ -1464,13 +1523,13 @@ static void __queue_delayed_work(int cpu, struct workqueue_struct *wq,
timer_stats_timer_set_start_info(&dwork->timer);
dwork->wq = wq;
- /* timer isn't guaranteed to run in this cpu, record earlier */
- if (cpu == WORK_CPU_UNBOUND)
- cpu = raw_smp_processor_id();
dwork->cpu = cpu;
timer->expires = jiffies + delay;
- add_timer_on(timer, cpu);
+ if (unlikely(cpu != WORK_CPU_UNBOUND))
+ add_timer_on(timer, cpu);
+ else
+ add_timer(timer);
}
/**
@@ -2355,7 +2414,8 @@ static void check_flush_dependency(struct workqueue_struct *target_wq,
WARN_ONCE(current->flags & PF_MEMALLOC,
"workqueue: PF_MEMALLOC task %d(%s) is flushing !WQ_MEM_RECLAIM %s:%pf",
current->pid, current->comm, target_wq->name, target_func);
- WARN_ONCE(worker && (worker->current_pwq->wq->flags & WQ_MEM_RECLAIM),
+ WARN_ONCE(worker && ((worker->current_pwq->wq->flags &
+ (WQ_MEM_RECLAIM | __WQ_LEGACY)) == WQ_MEM_RECLAIM),
"workqueue: WQ_MEM_RECLAIM %s:%pf is flushing !WQ_MEM_RECLAIM %s:%pf",
worker->current_pwq->wq->name, worker->current_func,
target_wq->name, target_func);