summaryrefslogtreecommitdiff
path: root/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'kernel')
-rw-r--r--kernel/bpf/helpers.c22
-rw-r--r--kernel/bpf/verifier.c9
-rw-r--r--kernel/cfi.c8
-rw-r--r--kernel/cgroup/cpuset.c59
-rw-r--r--kernel/cpu.c84
-rw-r--r--kernel/cred.c12
-rw-r--r--kernel/events/hw_breakpoint.c4
-rw-r--r--kernel/fork.c10
-rw-r--r--kernel/kcsan/debugfs.c2
-rw-r--r--kernel/locking/locktorture.c25
-rw-r--r--kernel/padata.c35
-rw-r--r--kernel/pid.c15
-rw-r--r--kernel/rcu/rcuscale.c4
-rw-r--r--kernel/rcu/rcutorture.c7
-rw-r--r--kernel/rcu/refscale.c36
-rw-r--r--kernel/rcu/srcutiny.c2
-rw-r--r--kernel/rcu/tasks.h36
-rw-r--r--kernel/rcu/tree.c107
-rw-r--r--kernel/rcu/tree_nocb.h1496
-rw-r--r--kernel/rcu/tree_plugin.h1506
-rw-r--r--kernel/rcu/tree_stall.h111
-rw-r--r--kernel/scftorture.c78
-rw-r--r--kernel/sched/core.c594
-rw-r--r--kernel/sched/deadline.c8
-rw-r--r--kernel/sched/debug.c10
-rw-r--r--kernel/sched/fair.c211
-rw-r--r--kernel/sched/sched.h31
-rw-r--r--kernel/sched/topology.c65
-rw-r--r--kernel/smp.c14
-rw-r--r--kernel/smpboot.c8
-rw-r--r--kernel/torture.c6
-rw-r--r--kernel/trace/Kconfig5
-rw-r--r--kernel/trace/ftrace.c4
-rw-r--r--kernel/trace/trace.c18
-rw-r--r--kernel/trace/trace.h32
-rw-r--r--kernel/trace/trace_events_hist.c2
-rw-r--r--kernel/trace/trace_osnoise.c56
37 files changed, 2767 insertions, 1965 deletions
diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
index 7a97b2f4747d..55f83ea09dae 100644
--- a/kernel/bpf/helpers.c
+++ b/kernel/bpf/helpers.c
@@ -353,9 +353,15 @@ const struct bpf_func_proto bpf_jiffies64_proto = {
#ifdef CONFIG_CGROUPS
BPF_CALL_0(bpf_get_current_cgroup_id)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
+ u64 cgrp_id;
- return cgroup_id(cgrp);
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
+ cgrp_id = cgroup_id(cgrp);
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
@@ -366,13 +372,17 @@ const struct bpf_func_proto bpf_get_current_cgroup_id_proto = {
BPF_CALL_1(bpf_get_current_ancestor_cgroup_id, int, ancestor_level)
{
- struct cgroup *cgrp = task_dfl_cgroup(current);
+ struct cgroup *cgrp;
struct cgroup *ancestor;
+ u64 cgrp_id;
+ rcu_read_lock();
+ cgrp = task_dfl_cgroup(current);
ancestor = cgroup_ancestor(cgrp, ancestor_level);
- if (!ancestor)
- return 0;
- return cgroup_id(ancestor);
+ cgrp_id = ancestor ? cgroup_id(ancestor) : 0;
+ rcu_read_unlock();
+
+ return cgrp_id;
}
const struct bpf_func_proto bpf_get_current_ancestor_cgroup_id_proto = {
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index f9bda5476ea5..49f07e2bf23b 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -5150,8 +5150,6 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
case BPF_MAP_TYPE_RINGBUF:
if (func_id != BPF_FUNC_ringbuf_output &&
func_id != BPF_FUNC_ringbuf_reserve &&
- func_id != BPF_FUNC_ringbuf_submit &&
- func_id != BPF_FUNC_ringbuf_discard &&
func_id != BPF_FUNC_ringbuf_query)
goto error;
break;
@@ -5260,6 +5258,12 @@ static int check_map_func_compatibility(struct bpf_verifier_env *env,
if (map->map_type != BPF_MAP_TYPE_PERF_EVENT_ARRAY)
goto error;
break;
+ case BPF_FUNC_ringbuf_output:
+ case BPF_FUNC_ringbuf_reserve:
+ case BPF_FUNC_ringbuf_query:
+ if (map->map_type != BPF_MAP_TYPE_RINGBUF)
+ goto error;
+ break;
case BPF_FUNC_get_stackid:
if (map->map_type != BPF_MAP_TYPE_STACK_TRACE)
goto error;
@@ -11663,6 +11667,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env)
if (aux_data[i].seen)
continue;
memcpy(insn + i, &trap, sizeof(trap));
+ aux_data[i].zext_dst = false;
}
}
diff --git a/kernel/cfi.c b/kernel/cfi.c
index e17a56639766..9594cfd1cf2c 100644
--- a/kernel/cfi.c
+++ b/kernel/cfi.c
@@ -248,9 +248,9 @@ static inline cfi_check_fn find_shadow_check_fn(unsigned long ptr)
{
cfi_check_fn fn;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
fn = ptr_to_check_fn(rcu_dereference_sched(cfi_shadow), ptr);
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
return fn;
}
@@ -269,11 +269,11 @@ static inline cfi_check_fn find_module_check_fn(unsigned long ptr)
cfi_check_fn fn = NULL;
struct module *mod;
- rcu_read_lock_sched();
+ rcu_read_lock_sched_notrace();
mod = __module_address(ptr);
if (mod)
fn = mod->cfi_check;
- rcu_read_unlock_sched();
+ rcu_read_unlock_sched_notrace();
return fn;
}
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index adb5190c4429..6500cbe0ce16 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -372,18 +372,29 @@ static inline bool is_in_v2_mode(void)
}
/*
- * Return in pmask the portion of a cpusets's cpus_allowed that
- * are online. If none are online, walk up the cpuset hierarchy
- * until we find one that does have some online cpus.
+ * Return in pmask the portion of a task's cpusets's cpus_allowed that
+ * are online and are capable of running the task. If none are found,
+ * walk up the cpuset hierarchy until we find one that does have some
+ * appropriate cpus.
*
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
* Call with callback_lock or cpuset_mutex held.
*/
-static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
+static void guarantee_online_cpus(struct task_struct *tsk,
+ struct cpumask *pmask)
{
- while (!cpumask_intersects(cs->effective_cpus, cpu_online_mask)) {
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ struct cpuset *cs;
+
+ if (WARN_ON(!cpumask_and(pmask, possible_mask, cpu_online_mask)))
+ cpumask_copy(pmask, cpu_online_mask);
+
+ rcu_read_lock();
+ cs = task_cs(tsk);
+
+ while (!cpumask_intersects(cs->effective_cpus, pmask)) {
cs = parent_cs(cs);
if (unlikely(!cs)) {
/*
@@ -393,11 +404,13 @@ static void guarantee_online_cpus(struct cpuset *cs, struct cpumask *pmask)
* cpuset's effective_cpus is on its way to be
* identical to cpu_online_mask.
*/
- cpumask_copy(pmask, cpu_online_mask);
- return;
+ goto out_unlock;
}
}
- cpumask_and(pmask, cs->effective_cpus, cpu_online_mask);
+ cpumask_and(pmask, pmask, cs->effective_cpus);
+
+out_unlock:
+ rcu_read_unlock();
}
/*
@@ -2199,15 +2212,13 @@ static void cpuset_attach(struct cgroup_taskset *tset)
percpu_down_write(&cpuset_rwsem);
- /* prepare for attach */
- if (cs == &top_cpuset)
- cpumask_copy(cpus_attach, cpu_possible_mask);
- else
- guarantee_online_cpus(cs, cpus_attach);
-
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cgroup_taskset_for_each(task, css, tset) {
+ if (cs != &top_cpuset)
+ guarantee_online_cpus(task, cpus_attach);
+ else
+ cpumask_copy(cpus_attach, task_cpu_possible_mask(task));
/*
* can_attach beforehand should guarantee that this doesn't
* fail. TODO: have a better way to handle failure here
@@ -3302,9 +3313,7 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
unsigned long flags;
spin_lock_irqsave(&callback_lock, flags);
- rcu_read_lock();
- guarantee_online_cpus(task_cs(tsk), pmask);
- rcu_read_unlock();
+ guarantee_online_cpus(tsk, pmask);
spin_unlock_irqrestore(&callback_lock, flags);
}
@@ -3318,13 +3327,22 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask)
* which will not contain a sane cpumask during cases such as cpu hotplugging.
* This is the absolute last resort for the scheduler and it is only used if
* _every_ other avenue has been traveled.
+ *
+ * Returns true if the affinity of @tsk was changed, false otherwise.
**/
-void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
+bool cpuset_cpus_allowed_fallback(struct task_struct *tsk)
{
+ const struct cpumask *possible_mask = task_cpu_possible_mask(tsk);
+ const struct cpumask *cs_mask;
+ bool changed = false;
+
rcu_read_lock();
- do_set_cpus_allowed(tsk, is_in_v2_mode() ?
- task_cs(tsk)->cpus_allowed : cpu_possible_mask);
+ cs_mask = task_cs(tsk)->cpus_allowed;
+ if (is_in_v2_mode() && cpumask_subset(cs_mask, possible_mask)) {
+ do_set_cpus_allowed(tsk, cs_mask);
+ changed = true;
+ }
rcu_read_unlock();
/*
@@ -3344,6 +3362,7 @@ void cpuset_cpus_allowed_fallback(struct task_struct *tsk)
* select_fallback_rq() will fix things ups and set cpu_possible_mask
* if required.
*/
+ return changed;
}
void __init cpuset_init_current_mems_allowed(void)
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 804b847912dc..192e43a87407 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -41,14 +41,19 @@
#include "smpboot.h"
/**
- * cpuhp_cpu_state - Per cpu hotplug state storage
+ * struct cpuhp_cpu_state - Per cpu hotplug state storage
* @state: The current cpu state
* @target: The target state
+ * @fail: Current CPU hotplug callback state
* @thread: Pointer to the hotplug thread
* @should_run: Thread should execute
* @rollback: Perform a rollback
* @single: Single callback invocation
* @bringup: Single callback bringup or teardown selector
+ * @cpu: CPU number
+ * @node: Remote CPU node; for multi-instance, do a
+ * single entry callback for install/remove
+ * @last: For multi-instance rollback, remember how far we got
* @cb_state: The state for a single callback (install/uninstall)
* @result: Result of the operation
* @done_up: Signal completion to the issuer of the task for cpu-up
@@ -106,11 +111,12 @@ static inline void cpuhp_lock_release(bool bringup) { }
#endif
/**
- * cpuhp_step - Hotplug state machine step
+ * struct cpuhp_step - Hotplug state machine step
* @name: Name of the step
* @startup: Startup function of the step
* @teardown: Teardown function of the step
* @cant_stop: Bringup/teardown can't be stopped at this step
+ * @multi_instance: State has multiple instances which get added afterwards
*/
struct cpuhp_step {
const char *name;
@@ -124,7 +130,9 @@ struct cpuhp_step {
int (*multi)(unsigned int cpu,
struct hlist_node *node);
} teardown;
+ /* private: */
struct hlist_head list;
+ /* public: */
bool cant_stop;
bool multi_instance;
};
@@ -143,7 +151,7 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
}
/**
- * cpuhp_invoke_callback _ Invoke the callbacks for a given state
+ * cpuhp_invoke_callback - Invoke the callbacks for a given state
* @cpu: The cpu for which the callback should be invoked
* @state: The state to do callbacks for
* @bringup: True if the bringup callback should be invoked
@@ -151,6 +159,8 @@ static bool cpuhp_step_empty(bool bringup, struct cpuhp_step *step)
* @lastp: For multi-instance rollback, remember how far we got
*
* Called from cpu hotplug and from the state register machinery.
+ *
+ * Return: %0 on success or a negative errno code
*/
static int cpuhp_invoke_callback(unsigned int cpu, enum cpuhp_state state,
bool bringup, struct hlist_node *node,
@@ -682,6 +692,10 @@ static int cpuhp_up_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback_range(true, cpu, st, target);
if (ret) {
+ pr_debug("CPU UP failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
+
cpuhp_reset_state(st, prev_state);
if (can_rollback_cpu(st))
WARN_ON(cpuhp_invoke_callback_range(false, cpu, st,
@@ -1081,6 +1095,9 @@ static int cpuhp_down_callbacks(unsigned int cpu, struct cpuhp_cpu_state *st,
ret = cpuhp_invoke_callback_range(false, cpu, st, target);
if (ret) {
+ pr_debug("CPU DOWN failed (%d) CPU %u state %s (%d)\n",
+ ret, cpu, cpuhp_get_step(st->state)->name,
+ st->state);
cpuhp_reset_state(st, prev_state);
@@ -1183,6 +1200,8 @@ static int cpu_down(unsigned int cpu, enum cpuhp_state target)
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use remove_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_down(struct device *dev)
{
@@ -1395,6 +1414,8 @@ out:
* This function is meant to be used by device core cpu subsystem only.
*
* Other subsystems should use add_cpu() instead.
+ *
+ * Return: %0 on success or a negative errno code
*/
int cpu_device_up(struct device *dev)
{
@@ -1420,6 +1441,8 @@ EXPORT_SYMBOL_GPL(add_cpu);
* On some architectures like arm64, we can hibernate on any CPU, but on
* wake up the CPU we hibernated on might be offline as a side effect of
* using maxcpus= for example.
+ *
+ * Return: %0 on success or a negative errno code
*/
int bringup_hibernate_cpu(unsigned int sleep_cpu)
{
@@ -1976,6 +1999,7 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
/**
* __cpuhp_setup_state_cpuslocked - Setup the callbacks for an hotplug machine state
* @state: The state to setup
+ * @name: Name of the step
* @invoke: If true, the startup function is invoked for cpus where
* cpu state >= @state
* @startup: startup callback function
@@ -1984,9 +2008,9 @@ EXPORT_SYMBOL_GPL(__cpuhp_state_add_instance);
* added afterwards.
*
* The caller needs to hold cpus read locked while calling this function.
- * Returns:
+ * Return:
* On success:
- * Positive state number if @state is CPUHP_AP_ONLINE_DYN
+ * Positive state number if @state is CPUHP_AP_ONLINE_DYN;
* 0 for all other states
* On failure: proper (negative) error code
*/
@@ -2232,18 +2256,17 @@ int cpuhp_smt_enable(void)
#endif
#if defined(CONFIG_SYSFS) && defined(CONFIG_HOTPLUG_CPU)
-static ssize_t show_cpuhp_state(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t state_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->state);
}
-static DEVICE_ATTR(state, 0444, show_cpuhp_state, NULL);
+static DEVICE_ATTR_RO(state);
-static ssize_t write_cpuhp_target(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t target_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2281,19 +2304,17 @@ out:
return ret ? ret : count;
}
-static ssize_t show_cpuhp_target(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t target_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->target);
}
-static DEVICE_ATTR(target, 0644, show_cpuhp_target, write_cpuhp_target);
-
+static DEVICE_ATTR_RW(target);
-static ssize_t write_cpuhp_fail(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t fail_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
struct cpuhp_step *sp;
@@ -2342,15 +2363,15 @@ static ssize_t write_cpuhp_fail(struct device *dev,
return count;
}
-static ssize_t show_cpuhp_fail(struct device *dev,
- struct device_attribute *attr, char *buf)
+static ssize_t fail_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
struct cpuhp_cpu_state *st = per_cpu_ptr(&cpuhp_state, dev->id);
return sprintf(buf, "%d\n", st->fail);
}
-static DEVICE_ATTR(fail, 0644, show_cpuhp_fail, write_cpuhp_fail);
+static DEVICE_ATTR_RW(fail);
static struct attribute *cpuhp_cpu_attrs[] = {
&dev_attr_state.attr,
@@ -2365,7 +2386,7 @@ static const struct attribute_group cpuhp_cpu_attr_group = {
NULL
};
-static ssize_t show_cpuhp_states(struct device *dev,
+static ssize_t states_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
ssize_t cur, res = 0;
@@ -2384,7 +2405,7 @@ static ssize_t show_cpuhp_states(struct device *dev,
mutex_unlock(&cpuhp_state_mutex);
return res;
}
-static DEVICE_ATTR(states, 0444, show_cpuhp_states, NULL);
+static DEVICE_ATTR_RO(states);
static struct attribute *cpuhp_cpu_root_attrs[] = {
&dev_attr_states.attr,
@@ -2457,28 +2478,27 @@ static const char *smt_states[] = {
[CPU_SMT_NOT_IMPLEMENTED] = "notimplemented",
};
-static ssize_t
-show_smt_control(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t control_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
const char *state = smt_states[cpu_smt_control];
return snprintf(buf, PAGE_SIZE - 2, "%s\n", state);
}
-static ssize_t
-store_smt_control(struct device *dev, struct device_attribute *attr,
- const char *buf, size_t count)
+static ssize_t control_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count)
{
return __store_smt_control(dev, attr, buf, count);
}
-static DEVICE_ATTR(control, 0644, show_smt_control, store_smt_control);
+static DEVICE_ATTR_RW(control);
-static ssize_t
-show_smt_active(struct device *dev, struct device_attribute *attr, char *buf)
+static ssize_t active_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
return snprintf(buf, PAGE_SIZE - 2, "%d\n", sched_smt_active());
}
-static DEVICE_ATTR(active, 0444, show_smt_active, NULL);
+static DEVICE_ATTR_RO(active);
static struct attribute *cpuhp_smt_attrs[] = {
&dev_attr_control.attr,
diff --git a/kernel/cred.c b/kernel/cred.c
index e6fd2b3fc31f..f784e08c2fbd 100644
--- a/kernel/cred.c
+++ b/kernel/cred.c
@@ -286,13 +286,13 @@ struct cred *prepare_creds(void)
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
- goto error;
-
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
+ goto error;
+
validate_creds(new);
return new;
@@ -753,13 +753,13 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
#ifdef CONFIG_SECURITY
new->security = NULL;
#endif
- if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
- goto error;
-
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
+ if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
+ goto error;
+
put_cred(old);
validate_creds(new);
return new;
diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c
index 835973444a1e..f32320ac02fd 100644
--- a/kernel/events/hw_breakpoint.c
+++ b/kernel/events/hw_breakpoint.c
@@ -568,7 +568,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
if (!cpu_events)
return (void __percpu __force *)ERR_PTR(-ENOMEM);
- get_online_cpus();
+ cpus_read_lock();
for_each_online_cpu(cpu) {
bp = perf_event_create_kernel_counter(attr, cpu, NULL,
triggered, context);
@@ -579,7 +579,7 @@ register_wide_hw_breakpoint(struct perf_event_attr *attr,
per_cpu(*cpu_events, cpu) = bp;
}
- put_online_cpus();
+ cpus_read_unlock();
if (likely(!err))
return cpu_events;
diff --git a/kernel/fork.c b/kernel/fork.c
index bc94b2cc5995..757301cf4d80 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -446,6 +446,7 @@ void put_task_stack(struct task_struct *tsk)
void free_task(struct task_struct *tsk)
{
+ release_user_cpus_ptr(tsk);
scs_release(tsk);
#ifndef CONFIG_THREAD_INFO_IN_TASK
@@ -828,10 +829,10 @@ void __init fork_init(void)
for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2;
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
- set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, RLIM_INFINITY);
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, RLIM_INFINITY);
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, RLIM_INFINITY);
+ set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, RLIM_INFINITY);
#ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
@@ -924,6 +925,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node)
#endif
if (orig->cpus_ptr == &orig->cpus_mask)
tsk->cpus_ptr = &tsk->cpus_mask;
+ dup_user_cpus_ptr(tsk, orig, node);
/*
* One for the user space visible state that goes away when reaped.
diff --git a/kernel/kcsan/debugfs.c b/kernel/kcsan/debugfs.c
index e65de172ccf7..1d1d1b0e4248 100644
--- a/kernel/kcsan/debugfs.c
+++ b/kernel/kcsan/debugfs.c
@@ -64,7 +64,7 @@ static noinline void microbenchmark(unsigned long iters)
{
const struct kcsan_ctx ctx_save = current->kcsan_ctx;
const bool was_enabled = READ_ONCE(kcsan_enabled);
- cycles_t cycles;
+ u64 cycles;
/* We may have been called from an atomic region; reset context. */
memset(&current->kcsan_ctx, 0, sizeof(current->kcsan_ctx));
diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c
index b3adb40549bf..7c5a4a087cc7 100644
--- a/kernel/locking/locktorture.c
+++ b/kernel/locking/locktorture.c
@@ -59,7 +59,7 @@ static struct task_struct **writer_tasks;
static struct task_struct **reader_tasks;
static bool lock_is_write_held;
-static bool lock_is_read_held;
+static atomic_t lock_is_read_held;
static unsigned long last_lock_release;
struct lock_stress_stats {
@@ -682,7 +682,7 @@ static int lock_torture_writer(void *arg)
if (WARN_ON_ONCE(lock_is_write_held))
lwsp->n_lock_fail++;
lock_is_write_held = true;
- if (WARN_ON_ONCE(lock_is_read_held))
+ if (WARN_ON_ONCE(atomic_read(&lock_is_read_held)))
lwsp->n_lock_fail++; /* rare, but... */
lwsp->n_lock_acquired++;
@@ -717,13 +717,13 @@ static int lock_torture_reader(void *arg)
schedule_timeout_uninterruptible(1);
cxt.cur_ops->readlock(tid);
- lock_is_read_held = true;
+ atomic_inc(&lock_is_read_held);
if (WARN_ON_ONCE(lock_is_write_held))
lrsp->n_lock_fail++; /* rare, but... */
lrsp->n_lock_acquired++;
cxt.cur_ops->read_delay(&rand);
- lock_is_read_held = false;
+ atomic_dec(&lock_is_read_held);
cxt.cur_ops->readunlock(tid);
stutter_wait("lock_torture_reader");
@@ -738,20 +738,22 @@ static int lock_torture_reader(void *arg)
static void __torture_print_stats(char *page,
struct lock_stress_stats *statp, bool write)
{
+ long cur;
bool fail = false;
int i, n_stress;
- long max = 0, min = statp ? statp[0].n_lock_acquired : 0;
+ long max = 0, min = statp ? data_race(statp[0].n_lock_acquired) : 0;
long long sum = 0;
n_stress = write ? cxt.nrealwriters_stress : cxt.nrealreaders_stress;
for (i = 0; i < n_stress; i++) {
- if (statp[i].n_lock_fail)
+ if (data_race(statp[i].n_lock_fail))
fail = true;
- sum += statp[i].n_lock_acquired;
- if (max < statp[i].n_lock_acquired)
- max = statp[i].n_lock_acquired;
- if (min > statp[i].n_lock_acquired)
- min = statp[i].n_lock_acquired;
+ cur = data_race(statp[i].n_lock_acquired);
+ sum += cur;
+ if (max < cur)
+ max = cur;
+ if (min > cur)
+ min = cur;
}
page += sprintf(page,
"%s: Total: %lld Max/Min: %ld/%ld %s Fail: %d %s\n",
@@ -996,7 +998,6 @@ static int __init lock_torture_init(void)
}
if (nreaders_stress) {
- lock_is_read_held = false;
cxt.lrsa = kmalloc_array(cxt.nrealreaders_stress,
sizeof(*cxt.lrsa),
GFP_KERNEL);
diff --git a/kernel/padata.c b/kernel/padata.c
index d4d3ba6e1728..18d3a5c699d8 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -9,19 +9,6 @@
*
* Copyright (c) 2020 Oracle and/or its affiliates.
* Author: Daniel Jordan <daniel.m.jordan@oracle.com>
- *
- * This program is free software; you can redistribute it and/or modify it
- * under the terms and conditions of the GNU General Public License,
- * version 2, as published by the Free Software Foundation.
- *
- * This program is distributed in the hope it will be useful, but WITHOUT
- * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
- * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
- * more details.
- *
- * You should have received a copy of the GNU General Public License along with
- * this program; if not, write to the Free Software Foundation, Inc.,
- * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
#include <linux/completion.h>
@@ -211,7 +198,7 @@ int padata_do_parallel(struct padata_shell *ps,
if ((pinst->flags & PADATA_RESET))
goto out;
- atomic_inc(&pd->refcnt);
+ refcount_inc(&pd->refcnt);
padata->pd = pd;
padata->cb_cpu = *cb_cpu;
@@ -383,7 +370,7 @@ static void padata_serial_worker(struct work_struct *serial_work)
}
local_bh_enable();
- if (atomic_sub_and_test(cnt, &pd->refcnt))
+ if (refcount_sub_and_test(cnt, &pd->refcnt))
padata_free_pd(pd);
}
@@ -593,7 +580,7 @@ static struct parallel_data *padata_alloc_pd(struct padata_shell *ps)
padata_init_reorder_list(pd);
padata_init_squeues(pd);
pd->seq_nr = -1;
- atomic_set(&pd->refcnt, 1);
+ refcount_set(&pd->refcnt, 1);
spin_lock_init(&pd->lock);
pd->cpu = cpumask_first(pd->cpumask.pcpu);
INIT_WORK(&pd->reorder_work, invoke_padata_reorder);
@@ -667,7 +654,7 @@ static int padata_replace(struct padata_instance *pinst)
synchronize_rcu();
list_for_each_entry_continue_reverse(ps, &pinst->pslist, list)
- if (atomic_dec_and_test(&ps->opd->refcnt))
+ if (refcount_dec_and_test(&ps->opd->refcnt))
padata_free_pd(ps->opd);
pinst->flags &= ~PADATA_RESET;
@@ -733,7 +720,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
struct cpumask *serial_mask, *parallel_mask;
int err = -EINVAL;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&pinst->lock);
switch (cpumask_type) {
@@ -753,7 +740,7 @@ int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
out:
mutex_unlock(&pinst->lock);
- put_online_cpus();
+ cpus_read_unlock();
return err;
}
@@ -992,7 +979,7 @@ struct padata_instance *padata_alloc(const char *name)
if (!pinst->parallel_wq)
goto err_free_inst;
- get_online_cpus();
+ cpus_read_lock();
pinst->serial_wq = alloc_workqueue("%s_serial", WQ_MEM_RECLAIM |
WQ_CPU_INTENSIVE, 1, name);
@@ -1026,7 +1013,7 @@ struct padata_instance *padata_alloc(const char *name)
&pinst->cpu_dead_node);
#endif
- put_online_cpus();
+ cpus_read_unlock();
return pinst;
@@ -1036,7 +1023,7 @@ err_free_masks:
err_free_serial_wq:
destroy_workqueue(pinst->serial_wq);
err_put_cpus:
- put_online_cpus();
+ cpus_read_unlock();
destroy_workqueue(pinst->parallel_wq);
err_free_inst:
kfree(pinst);
@@ -1074,9 +1061,9 @@ struct padata_shell *padata_alloc_shell(struct padata_instance *pinst)
ps->pinst = pinst;
- get_online_cpus();
+ cpus_read_lock();
pd = padata_alloc_pd(ps);
- put_online_cpus();
+ cpus_read_unlock();
if (!pd)
goto out_free_ps;
diff --git a/kernel/pid.c b/kernel/pid.c
index ebdf9c60cd0b..efe87db44683 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -550,13 +550,21 @@ struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags)
* Note, that this function can only be called after the fd table has
* been unshared to avoid leaking the pidfd to the new process.
*
+ * This symbol should not be explicitly exported to loadable modules.
+ *
* Return: On success, a cloexec pidfd is returned.
* On error, a negative errno number will be returned.
*/
-static int pidfd_create(struct pid *pid, unsigned int flags)
+int pidfd_create(struct pid *pid, unsigned int flags)
{
int fd;
+ if (!pid || !pid_has_task(pid, PIDTYPE_TGID))
+ return -EINVAL;
+
+ if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC))
+ return -EINVAL;
+
fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
flags | O_RDWR | O_CLOEXEC);
if (fd < 0)
@@ -596,10 +604,7 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
- if (pid_has_task(p, PIDTYPE_TGID))
- fd = pidfd_create(p, flags);
- else
- fd = -EINVAL;
+ fd = pidfd_create(p, flags);
put_pid(p);
return fd;
diff --git a/kernel/rcu/rcuscale.c b/kernel/rcu/rcuscale.c
index dca51fe9c73f..2cc34a22a506 100644
--- a/kernel/rcu/rcuscale.c
+++ b/kernel/rcu/rcuscale.c
@@ -487,7 +487,7 @@ retry:
if (gp_async) {
cur_ops->gp_barrier();
}
- writer_n_durations[me] = i_max;
+ writer_n_durations[me] = i_max + 1;
torture_kthread_stopping("rcu_scale_writer");
return 0;
}
@@ -561,7 +561,7 @@ rcu_scale_cleanup(void)
wdpp = writer_durations[i];
if (!wdpp)
continue;
- for (j = 0; j <= writer_n_durations[i]; j++) {
+ for (j = 0; j < writer_n_durations[i]; j++) {
wdp = &wdpp[j];
pr_alert("%s%s %4d writer-duration: %5d %llu\n",
scale_type, SCALE_FLAG,
diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 40ef5417d954..ab4215266ebe 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -2022,8 +2022,13 @@ static int rcu_torture_stall(void *args)
__func__, raw_smp_processor_id());
while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(),
stop_at))
- if (stall_cpu_block)
+ if (stall_cpu_block) {
+#ifdef CONFIG_PREEMPTION
+ preempt_schedule();
+#else
schedule_timeout_uninterruptible(HZ);
+#endif
+ }
if (stall_cpu_irqsoff)
local_irq_enable();
else if (!stall_cpu_block)
diff --git a/kernel/rcu/refscale.c b/kernel/rcu/refscale.c
index d998a76fb542..66dc14cf5687 100644
--- a/kernel/rcu/refscale.c
+++ b/kernel/rcu/refscale.c
@@ -467,6 +467,40 @@ static struct ref_scale_ops acqrel_ops = {
.name = "acqrel"
};
+static volatile u64 stopopts;
+
+static void ref_clock_section(const int nloops)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--)
+ x += ktime_get_real_fast_ns();
+ preempt_enable();
+ stopopts = x;
+}
+
+static void ref_clock_delay_section(const int nloops, const int udl, const int ndl)
+{
+ u64 x = 0;
+ int i;
+
+ preempt_disable();
+ for (i = nloops; i >= 0; i--) {
+ x += ktime_get_real_fast_ns();
+ un_delay(udl, ndl);
+ }
+ preempt_enable();
+ stopopts = x;
+}
+
+static struct ref_scale_ops clock_ops = {
+ .readsection = ref_clock_section,
+ .delaysection = ref_clock_delay_section,
+ .name = "clock"
+};
+
static void rcu_scale_one_reader(void)
{
if (readdelay <= 0)
@@ -759,7 +793,7 @@ ref_scale_init(void)
int firsterr = 0;
static struct ref_scale_ops *scale_ops[] = {
&rcu_ops, &srcu_ops, &rcu_trace_ops, &rcu_tasks_ops, &refcnt_ops, &rwlock_ops,
- &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops,
+ &rwsem_ops, &lock_ops, &lock_irq_ops, &acqrel_ops, &clock_ops,
};
if (!torture_init_begin(scale_type, verbose))
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index 26344dc6483b..a0ba2ed49bc6 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,7 +96,7 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
*/
void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
{
- int newval = ssp->srcu_lock_nesting[idx] - 1;
+ int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
if (!newval && READ_ONCE(ssp->srcu_gp_waiting))
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index 8536c55df514..806160c44b17 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -643,8 +643,8 @@ void exit_tasks_rcu_finish(void) { exit_tasks_rcu_finish_trace(current); }
//
// "Rude" variant of Tasks RCU, inspired by Steve Rostedt's trick of
// passing an empty function to schedule_on_each_cpu(). This approach
-// provides an asynchronous call_rcu_tasks_rude() API and batching
-// of concurrent calls to the synchronous synchronize_rcu_rude() API.
+// provides an asynchronous call_rcu_tasks_rude() API and batching of
+// concurrent calls to the synchronous synchronize_rcu_tasks_rude() API.
// This invokes schedule_on_each_cpu() in order to send IPIs far and wide
// and induces otherwise unnecessary context switches on all online CPUs,
// whether idle or not.
@@ -785,7 +785,10 @@ EXPORT_SYMBOL_GPL(show_rcu_tasks_rude_gp_kthread);
// set that task's .need_qs flag so that task's next outermost
// rcu_read_unlock_trace() will report the quiescent state (in which
// case the count of readers is incremented). If both attempts fail,
-// the task is added to a "holdout" list.
+// the task is added to a "holdout" list. Note that IPIs are used
+// to invoke trc_read_check_handler() in the context of running tasks
+// in order to avoid ordering overhead on common-case shared-variable
+// accessses.
// rcu_tasks_trace_postscan():
// Initialize state and attempt to identify an immediate quiescent
// state as above (but only for idle tasks), unblock CPU-hotplug
@@ -847,7 +850,7 @@ static DEFINE_IRQ_WORK(rcu_tasks_trace_iw, rcu_read_unlock_iw);
/* If we are the last reader, wake up the grace-period kthread. */
void rcu_read_unlock_trace_special(struct task_struct *t, int nesting)
{
- int nq = t->trc_reader_special.b.need_qs;
+ int nq = READ_ONCE(t->trc_reader_special.b.need_qs);
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB) &&
t->trc_reader_special.b.need_mb)
@@ -894,7 +897,7 @@ static void trc_read_check_handler(void *t_in)
// If the task is not in a read-side critical section, and
// if this is the last reader, awaken the grace-period kthread.
- if (likely(!t->trc_reader_nesting)) {
+ if (likely(!READ_ONCE(t->trc_reader_nesting))) {
if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
wake_up(&trc_wait);
// Mark as checked after decrement to avoid false
@@ -903,7 +906,7 @@ static void trc_read_check_handler(void *t_in)
goto reset_ipi;
}
// If we are racing with an rcu_read_unlock_trace(), try again later.
- if (unlikely(t->trc_reader_nesting < 0)) {
+ if (unlikely(READ_ONCE(t->trc_reader_nesting) < 0)) {
if (WARN_ON_ONCE(atomic_dec_and_test(&trc_n_readers_need_end)))
wake_up(&trc_wait);
goto reset_ipi;
@@ -913,14 +916,14 @@ static void trc_read_check_handler(void *t_in)
// Get here if the task is in a read-side critical section. Set
// its state so that it will awaken the grace-period kthread upon
// exit from that critical section.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
reset_ipi:
// Allow future IPIs to be sent on CPU and for task.
// Also order this IPI handler against any later manipulations of
// the intended task.
- smp_store_release(&per_cpu(trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
+ smp_store_release(per_cpu_ptr(&trc_ipi_to_cpu, smp_processor_id()), false); // ^^^
smp_store_release(&texp->trc_ipi_to_cpu, -1); // ^^^
}
@@ -950,6 +953,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
n_heavy_reader_ofl_updates++;
in_qs = true;
} else {
+ // The task is not running, so C-language access is safe.
in_qs = likely(!t->trc_reader_nesting);
}
@@ -964,7 +968,7 @@ static bool trc_inspect_reader(struct task_struct *t, void *arg)
// state so that it will awaken the grace-period kthread upon exit
// from that critical section.
atomic_inc(&trc_n_readers_need_end); // One more to wait on.
- WARN_ON_ONCE(t->trc_reader_special.b.need_qs);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs));
WRITE_ONCE(t->trc_reader_special.b.need_qs, true);
return true;
}
@@ -982,7 +986,7 @@ static void trc_wait_for_one_reader(struct task_struct *t,
// The current task had better be in a quiescent state.
if (t == current) {
t->trc_reader_checked = true;
- WARN_ON_ONCE(t->trc_reader_nesting);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
return;
}
@@ -994,6 +998,12 @@ static void trc_wait_for_one_reader(struct task_struct *t,
}
put_task_struct(t);
+ // If this task is not yet on the holdout list, then we are in
+ // an RCU read-side critical section. Otherwise, the invocation of
+ // rcu_add_holdout() that added it to the list did the necessary
+ // get_task_struct(). Either way, the task cannot be freed out
+ // from under this code.
+
// If currently running, send an IPI, either way, add to list.
trc_add_holdout(t, bhp);
if (task_curr(t) &&
@@ -1092,8 +1102,8 @@ static void show_stalled_task_trace(struct task_struct *t, bool *firstreport)
".I"[READ_ONCE(t->trc_ipi_to_cpu) > 0],
".i"[is_idle_task(t)],
".N"[cpu > 0 && tick_nohz_full_cpu(cpu)],
- t->trc_reader_nesting,
- " N"[!!t->trc_reader_special.b.need_qs],
+ READ_ONCE(t->trc_reader_nesting),
+ " N"[!!READ_ONCE(t->trc_reader_special.b.need_qs)],
cpu);
sched_show_task(t);
}
@@ -1187,7 +1197,7 @@ static void rcu_tasks_trace_postgp(struct rcu_tasks *rtp)
static void exit_tasks_rcu_finish_trace(struct task_struct *t)
{
WRITE_ONCE(t->trc_reader_checked, true);
- WARN_ON_ONCE(t->trc_reader_nesting);
+ WARN_ON_ONCE(READ_ONCE(t->trc_reader_nesting));
WRITE_ONCE(t->trc_reader_nesting, 0);
if (WARN_ON_ONCE(READ_ONCE(t->trc_reader_special.b.need_qs)))
rcu_read_unlock_trace_special(t, 0);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 51f24ecd94b2..bce848e50512 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -74,17 +74,10 @@
/* Data structures. */
-/*
- * Steal a bit from the bottom of ->dynticks for idle entry/exit
- * control. Initially this is for TLB flushing.
- */
-#define RCU_DYNTICK_CTRL_MASK 0x1
-#define RCU_DYNTICK_CTRL_CTR (RCU_DYNTICK_CTRL_MASK + 1)
-
static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
.dynticks_nesting = 1,
.dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE,
- .dynticks = ATOMIC_INIT(RCU_DYNTICK_CTRL_CTR),
+ .dynticks = ATOMIC_INIT(1),
#ifdef CONFIG_RCU_NOCB_CPU
.cblist.flags = SEGCBLIST_SOFTIRQ_ONLY,
#endif
@@ -259,6 +252,15 @@ void rcu_softirq_qs(void)
}
/*
+ * Increment the current CPU's rcu_data structure's ->dynticks field
+ * with ordering. Return the new value.
+ */
+static noinline noinstr unsigned long rcu_dynticks_inc(int incby)
+{
+ return arch_atomic_add_return(incby, this_cpu_ptr(&rcu_data.dynticks));
+}
+
+/*
* Record entry into an extended quiescent state. This is only to be
* called when not already in an extended quiescent state, that is,
* RCU is watching prior to the call to this function and is no longer
@@ -266,7 +268,6 @@ void rcu_softirq_qs(void)
*/
static noinstr void rcu_dynticks_eqs_enter(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
/*
@@ -275,13 +276,9 @@ static noinstr void rcu_dynticks_eqs_enter(void)
* next idle sojourn.
*/
rcu_dynticks_task_trace_enter(); // Before ->dynticks update!
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = rcu_dynticks_inc(1);
// RCU is no longer watching. Better be in extended quiescent state!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_CTR));
- /* Better not have special action (TLB flush) pending! */
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- (seq & RCU_DYNTICK_CTRL_MASK));
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & 0x1));
}
/*
@@ -291,7 +288,6 @@ static noinstr void rcu_dynticks_eqs_enter(void)
*/
static noinstr void rcu_dynticks_eqs_exit(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
int seq;
/*
@@ -299,15 +295,10 @@ static noinstr void rcu_dynticks_eqs_exit(void)
* and we also must force ordering with the next RCU read-side
* critical section.
*/
- seq = arch_atomic_add_return(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ seq = rcu_dynticks_inc(1);
// RCU is now watching. Better not be in an extended quiescent state!
rcu_dynticks_task_trace_exit(); // After ->dynticks update!
- WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) &&
- !(seq & RCU_DYNTICK_CTRL_CTR));
- if (seq & RCU_DYNTICK_CTRL_MASK) {
- arch_atomic_andnot(RCU_DYNTICK_CTRL_MASK, &rdp->dynticks);
- smp_mb__after_atomic(); /* _exit after clearing mask. */
- }
+ WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & 0x1));
}
/*
@@ -324,9 +315,9 @@ static void rcu_dynticks_eqs_online(void)
{
struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
- if (atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR)
+ if (atomic_read(&rdp->dynticks) & 0x1)
return;
- atomic_add(RCU_DYNTICK_CTRL_CTR, &rdp->dynticks);
+ rcu_dynticks_inc(1);
}
/*
@@ -336,9 +327,7 @@ static void rcu_dynticks_eqs_online(void)
*/
static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
{
- struct rcu_data *rdp = this_cpu_ptr(&rcu_data);
-
- return !(arch_atomic_read(&rdp->dynticks) & RCU_DYNTICK_CTRL_CTR);
+ return !(atomic_read(this_cpu_ptr(&rcu_data.dynticks)) & 0x1);
}
/*
@@ -347,9 +336,8 @@ static __always_inline bool rcu_dynticks_curr_cpu_in_eqs(void)
*/
static int rcu_dynticks_snap(struct rcu_data *rdp)
{
- int snap = atomic_add_return(0, &rdp->dynticks);
-
- return snap & ~RCU_DYNTICK_CTRL_MASK;
+ smp_mb(); // Fundamental RCU ordering guarantee.
+ return atomic_read_acquire(&rdp->dynticks);
}
/*
@@ -358,7 +346,7 @@ static int rcu_dynticks_snap(struct rcu_data *rdp)
*/
static bool rcu_dynticks_in_eqs(int snap)
{
- return !(snap & RCU_DYNTICK_CTRL_CTR);
+ return !(snap & 0x1);
}
/* Return true if the specified CPU is currently idle from an RCU viewpoint. */
@@ -389,8 +377,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
int snap;
// If not quiescent, force back to earlier extended quiescent state.
- snap = atomic_read(&rdp->dynticks) & ~(RCU_DYNTICK_CTRL_MASK |
- RCU_DYNTICK_CTRL_CTR);
+ snap = atomic_read(&rdp->dynticks) & ~0x1;
smp_rmb(); // Order ->dynticks and *vp reads.
if (READ_ONCE(*vp))
@@ -398,32 +385,7 @@ bool rcu_dynticks_zero_in_eqs(int cpu, int *vp)
smp_rmb(); // Order *vp read and ->dynticks re-read.
// If still in the same extended quiescent state, we are good!
- return snap == (atomic_read(&rdp->dynticks) & ~RCU_DYNTICK_CTRL_MASK);
-}
-
-/*
- * Set the special (bottom) bit of the specified CPU so that it
- * will take special action (such as flushing its TLB) on the
- * next exit from an extended quiescent state. Returns true if
- * the bit was successfully set, or false if the CPU was not in
- * an extended quiescent state.
- */
-bool rcu_eqs_special_set(int cpu)
-{
- int old;
- int new;
- int new_old;
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-
- new_old = atomic_read(&rdp->dynticks);
- do {
- old = new_old;
- if (old & RCU_DYNTICK_CTRL_CTR)
- return false;
- new = old | RCU_DYNTICK_CTRL_MASK;
- new_old = atomic_cmpxchg(&rdp->dynticks, old, new);
- } while (new_old != old);
- return true;
+ return snap == atomic_read(&rdp->dynticks);
}
/*
@@ -439,13 +401,12 @@ bool rcu_eqs_special_set(int cpu)
*/
notrace void rcu_momentary_dyntick_idle(void)
{
- int special;
+ int seq;
raw_cpu_write(rcu_data.rcu_need_heavy_qs, false);
- special = atomic_add_return(2 * RCU_DYNTICK_CTRL_CTR,
- &this_cpu_ptr(&rcu_data)->dynticks);
+ seq = rcu_dynticks_inc(2);
/* It is illegal to call this from idle state. */
- WARN_ON_ONCE(!(special & RCU_DYNTICK_CTRL_CTR));
+ WARN_ON_ONCE(!(seq & 0x1));
rcu_preempt_deferred_qs(current);
}
EXPORT_SYMBOL_GPL(rcu_momentary_dyntick_idle);
@@ -1325,7 +1286,7 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
*/
jtsq = READ_ONCE(jiffies_to_sched_qs);
ruqp = per_cpu_ptr(&rcu_data.rcu_urgent_qs, rdp->cpu);
- rnhqp = &per_cpu(rcu_data.rcu_need_heavy_qs, rdp->cpu);
+ rnhqp = per_cpu_ptr(&rcu_data.rcu_need_heavy_qs, rdp->cpu);
if (!READ_ONCE(*rnhqp) &&
(time_after(jiffies, rcu_state.gp_start + jtsq * 2) ||
time_after(jiffies, rcu_state.jiffies_resched) ||
@@ -1772,7 +1733,7 @@ static void rcu_strict_gp_boundary(void *unused)
/*
* Initialize a new grace period. Return false if no grace period required.
*/
-static bool rcu_gp_init(void)
+static noinline_for_stack bool rcu_gp_init(void)
{
unsigned long firstseq;
unsigned long flags;
@@ -1966,7 +1927,7 @@ static void rcu_gp_fqs(bool first_time)
/*
* Loop doing repeated quiescent-state forcing until the grace period ends.
*/
-static void rcu_gp_fqs_loop(void)
+static noinline_for_stack void rcu_gp_fqs_loop(void)
{
bool first_gp_fqs;
int gf = 0;
@@ -1993,8 +1954,8 @@ static void rcu_gp_fqs_loop(void)
trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq,
TPS("fqswait"));
WRITE_ONCE(rcu_state.gp_state, RCU_GP_WAIT_FQS);
- ret = swait_event_idle_timeout_exclusive(
- rcu_state.gp_wq, rcu_gp_fqs_check_wake(&gf), j);
+ (void)swait_event_idle_timeout_exclusive(rcu_state.gp_wq,
+ rcu_gp_fqs_check_wake(&gf), j);
rcu_gp_torture_wait();
WRITE_ONCE(rcu_state.gp_state, RCU_GP_DOING_FQS);
/* Locking provides needed memory barriers. */
@@ -2471,9 +2432,6 @@ int rcutree_dead_cpu(unsigned int cpu)
WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
/* Adjust any no-longer-needed kthreads. */
rcu_boost_kthread_setaffinity(rnp, -1);
- /* Do any needed no-CB deferred wakeups from this CPU. */
- do_nocb_deferred_wakeup(per_cpu_ptr(&rcu_data, cpu));
-
// Stop-machine done, so allow nohz_full to disable tick.
tick_dep_clear(TICK_DEP_BIT_RCU);
return 0;
@@ -4050,7 +4008,7 @@ void rcu_barrier(void)
*/
init_completion(&rcu_state.barrier_completion);
atomic_set(&rcu_state.barrier_cpu_count, 2);
- get_online_cpus();
+ cpus_read_lock();
/*
* Force each CPU with callbacks to register a new callback.
@@ -4081,7 +4039,7 @@ void rcu_barrier(void)
rcu_state.barrier_sequence);
}
}
- put_online_cpus();
+ cpus_read_unlock();
/*
* Now that we have an rcu_barrier_callback() callback on each
@@ -4784,4 +4742,5 @@ void __init rcu_init(void)
#include "tree_stall.h"
#include "tree_exp.h"
+#include "tree_nocb.h"
#include "tree_plugin.h"
diff --git a/kernel/rcu/tree_nocb.h b/kernel/rcu/tree_nocb.h
new file mode 100644
index 000000000000..8fdf44f8523f
--- /dev/null
+++ b/kernel/rcu/tree_nocb.h
@@ -0,0 +1,1496 @@
+/* SPDX-License-Identifier: GPL-2.0+ */
+/*
+ * Read-Copy Update mechanism for mutual exclusion (tree-based version)
+ * Internal non-public definitions that provide either classic
+ * or preemptible semantics.
+ *
+ * Copyright Red Hat, 2009
+ * Copyright IBM Corporation, 2009
+ * Copyright SUSE, 2021
+ *
+ * Author: Ingo Molnar <mingo@elte.hu>
+ * Paul E. McKenney <paulmck@linux.ibm.com>
+ * Frederic Weisbecker <frederic@kernel.org>
+ */
+
+#ifdef CONFIG_RCU_NOCB_CPU
+static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
+static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return lockdep_is_held(&rdp->nocb_lock);
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ /* Race on early boot between thread creation and assignment */
+ if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
+ return true;
+
+ if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
+ if (in_task())
+ return true;
+ return false;
+}
+
+/*
+ * Offload callback processing from the boot-time-specified set of CPUs
+ * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
+ * created that pull the callbacks from the corresponding CPU, wait for
+ * a grace period to elapse, and invoke the callbacks. These kthreads
+ * are organized into GP kthreads, which manage incoming callbacks, wait for
+ * grace periods, and awaken CB kthreads, and the CB kthreads, which only
+ * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
+ * do a wake_up() on their GP kthread when they insert a callback into any
+ * empty list, unless the rcu_nocb_poll boot parameter has been specified,
+ * in which case each kthread actively polls its CPU. (Which isn't so great
+ * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
+ *
+ * This is intended to be used in conjunction with Frederic Weisbecker's
+ * adaptive-idle work, which would seriously reduce OS jitter on CPUs
+ * running CPU-bound user-mode computations.
+ *
+ * Offloading of callbacks can also be used as an energy-efficiency
+ * measure because CPUs with no RCU callbacks queued are more aggressive
+ * about entering dyntick-idle mode.
+ */
+
+
+/*
+ * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
+ * If the list is invalid, a warning is emitted and all CPUs are offloaded.
+ */
+static int __init rcu_nocb_setup(char *str)
+{
+ alloc_bootmem_cpumask_var(&rcu_nocb_mask);
+ if (cpulist_parse(str, rcu_nocb_mask)) {
+ pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
+ cpumask_setall(rcu_nocb_mask);
+ }
+ return 1;
+}
+__setup("rcu_nocbs=", rcu_nocb_setup);
+
+static int __init parse_rcu_nocb_poll(char *arg)
+{
+ rcu_nocb_poll = true;
+ return 0;
+}
+early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
+
+/*
+ * Don't bother bypassing ->cblist if the call_rcu() rate is low.
+ * After all, the main point of bypassing is to avoid lock contention
+ * on ->nocb_lock, which only can happen at high call_rcu() rates.
+ */
+static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
+module_param(nocb_nobypass_lim_per_jiffy, int, 0);
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
+ * lock isn't immediately available, increment ->nocb_lock_contended to
+ * flag the contention.
+ */
+static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
+ __acquires(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ if (raw_spin_trylock(&rdp->nocb_bypass_lock))
+ return;
+ atomic_inc(&rdp->nocb_lock_contended);
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ smp_mb__after_atomic(); /* atomic_inc() before lock. */
+ raw_spin_lock(&rdp->nocb_bypass_lock);
+ smp_mb__before_atomic(); /* atomic_dec() after lock. */
+ atomic_dec(&rdp->nocb_lock_contended);
+}
+
+/*
+ * Spinwait until the specified rcu_data structure's ->nocb_lock is
+ * not contended. Please note that this is extremely special-purpose,
+ * relying on the fact that at most two kthreads and one CPU contend for
+ * this lock, and also that the two kthreads are guaranteed to have frequent
+ * grace-period-duration time intervals between successive acquisitions
+ * of the lock. This allows us to use an extremely simple throttling
+ * mechanism, and further to apply it only to the CPU doing floods of
+ * call_rcu() invocations. Don't try this at home!
+ */
+static void rcu_nocb_wait_contended(struct rcu_data *rdp)
+{
+ WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
+ while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
+ cpu_relax();
+}
+
+/*
+ * Conditionally acquire the specified rcu_data structure's
+ * ->nocb_bypass_lock.
+ */
+static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ return raw_spin_trylock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_bypass_lock.
+ */
+static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
+ __releases(&rdp->nocb_bypass_lock)
+{
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+}
+
+/*
+ * Acquire the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (!rcu_rdp_is_offloaded(rdp))
+ return;
+ raw_spin_lock(&rdp->nocb_lock);
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock, but only
+ * if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock(&rdp->nocb_lock);
+ }
+}
+
+/*
+ * Release the specified rcu_data structure's ->nocb_lock and restore
+ * interrupts, but only if it corresponds to a no-CBs CPU.
+ */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ if (rcu_rdp_is_offloaded(rdp)) {
+ lockdep_assert_irqs_disabled();
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+ } else {
+ local_irq_restore(flags);
+ }
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+ if (rcu_rdp_is_offloaded(rdp))
+ lockdep_assert_held(&rdp->nocb_lock);
+}
+
+/*
+ * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
+ * grace period.
+ */
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+ swake_up_all(sq);
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+ init_swait_queue_head(&rnp->nocb_gp_wq[0]);
+ init_swait_queue_head(&rnp->nocb_gp_wq[1]);
+}
+
+/* Is the specified CPU a no-CBs CPU? */
+bool rcu_is_nocb_cpu(int cpu)
+{
+ if (cpumask_available(rcu_nocb_mask))
+ return cpumask_test_cpu(cpu, rcu_nocb_mask);
+ return false;
+}
+
+static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp,
+ bool force, unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ bool needwake = false;
+
+ if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("AlreadyAwake"));
+ return false;
+ }
+
+ if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&rdp_gp->nocb_timer);
+ }
+
+ if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
+ WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
+ needwake = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ if (needwake) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+ }
+
+ return needwake;
+}
+
+/*
+ * Kick the GP kthread for this NOCB group.
+ */
+static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return __wake_nocb_gp(rdp_gp, rdp, force, flags);
+}
+
+/*
+ * Arrange to wake the GP kthread for this NOCB group at some future
+ * time when it is safe to do so.
+ */
+static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
+ const char *reason)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+
+ /*
+ * Bypass wakeup overrides previous deferments. In case
+ * of callback storm, no need to wake up too early.
+ */
+ if (waketype == RCU_NOCB_WAKE_BYPASS) {
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ } else {
+ if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
+ mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
+ if (rdp_gp->nocb_defer_wakeup < waketype)
+ WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
+ }
+
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ struct rcu_cblist rcl;
+
+ WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
+ rcu_lockdep_assert_cblist_protected(rdp);
+ lockdep_assert_held(&rdp->nocb_bypass_lock);
+ if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
+ raw_spin_unlock(&rdp->nocb_bypass_lock);
+ return false;
+ }
+ /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
+ if (rhp)
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
+ rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ rcu_nocb_bypass_unlock(rdp);
+ return true;
+}
+
+/*
+ * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
+ * However, if there is a callback to be enqueued and if ->nocb_bypass
+ * proves to be initially empty, just return false because the no-CB GP
+ * kthread may need to be awakened in this case.
+ *
+ * Note that this function always returns true if rhp is NULL.
+ */
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ if (!rcu_rdp_is_offloaded(rdp))
+ return true;
+ rcu_lockdep_assert_cblist_protected(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ return rcu_nocb_do_flush_bypass(rdp, rhp, j);
+}
+
+/*
+ * If the ->nocb_bypass_lock is immediately available, flush the
+ * ->nocb_bypass queue into ->cblist.
+ */
+static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
+{
+ rcu_lockdep_assert_cblist_protected(rdp);
+ if (!rcu_rdp_is_offloaded(rdp) ||
+ !rcu_nocb_bypass_trylock(rdp))
+ return;
+ WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
+}
+
+/*
+ * See whether it is appropriate to use the ->nocb_bypass list in order
+ * to control contention on ->nocb_lock. A limited number of direct
+ * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
+ * is non-empty, further callbacks must be placed into ->nocb_bypass,
+ * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
+ * back to direct use of ->cblist. However, ->nocb_bypass should not be
+ * used if ->cblist is empty, because otherwise callbacks can be stranded
+ * on ->nocb_bypass because we cannot count on the current CPU ever again
+ * invoking call_rcu(). The general rule is that if ->nocb_bypass is
+ * non-empty, the corresponding no-CBs grace-period kthread must not be
+ * in an indefinite sleep state.
+ *
+ * Finally, it is not permitted to use the bypass during early boot,
+ * as doing so would confuse the auto-initialization code. Besides
+ * which, there is no point in worrying about lock contention while
+ * there is only one CPU in operation.
+ */
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
+{
+ unsigned long c;
+ unsigned long cur_gp_seq;
+ unsigned long j = jiffies;
+ long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+
+ lockdep_assert_irqs_disabled();
+
+ // Pure softirq/rcuc based processing: no bypassing, no
+ // locking.
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // In the process of (de-)offloading: no bypassing, but
+ // locking.
+ if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false; /* Not offloaded, no bypassing. */
+ }
+
+ // Don't use ->nocb_bypass during early boot.
+ if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
+ rcu_nocb_lock(rdp);
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ return false;
+ }
+
+ // If we have advanced to a new jiffy, reset counts to allow
+ // moving back from ->nocb_bypass to ->cblist.
+ if (j == rdp->nocb_nobypass_last) {
+ c = rdp->nocb_nobypass_count + 1;
+ } else {
+ WRITE_ONCE(rdp->nocb_nobypass_last, j);
+ c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
+ if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
+ nocb_nobypass_lim_per_jiffy))
+ c = 0;
+ else if (c > nocb_nobypass_lim_per_jiffy)
+ c = nocb_nobypass_lim_per_jiffy;
+ }
+ WRITE_ONCE(rdp->nocb_nobypass_count, c);
+
+ // If there hasn't yet been all that many ->cblist enqueues
+ // this jiffy, tell the caller to enqueue onto ->cblist. But flush
+ // ->nocb_bypass first.
+ if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
+ rcu_nocb_lock(rdp);
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+
+ // If ->nocb_bypass has been used too long or is too full,
+ // flush ->nocb_bypass to ->cblist.
+ if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
+ ncbs >= qhimark) {
+ rcu_nocb_lock(rdp);
+ if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
+ *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
+ if (*was_alldone)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstQ"));
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+ return false; // Caller must enqueue the callback.
+ }
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ return true; // Callback already enqueued.
+ }
+
+ // We need to use the bypass.
+ rcu_nocb_wait_contended(rdp);
+ rcu_nocb_bypass_lock(rdp);
+ ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
+ rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
+ if (!ncbs) {
+ WRITE_ONCE(rdp->nocb_bypass_first, j);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
+ }
+ rcu_nocb_bypass_unlock(rdp);
+ smp_mb(); /* Order enqueue before wake. */
+ if (ncbs) {
+ local_irq_restore(flags);
+ } else {
+ // No-CBs GP kthread might be indefinitely asleep, if so, wake.
+ rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
+ if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQwake"));
+ __call_rcu_nocb_wake(rdp, true, flags);
+ } else {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("FirstBQnoWake"));
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ }
+ }
+ return true; // Callback already enqueued.
+}
+
+/*
+ * Awaken the no-CBs grace-period kthread if needed, either due to it
+ * legitimately being asleep or due to overload conditions.
+ *
+ * If warranted, also wake up the kthread servicing this CPUs queues.
+ */
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
+ unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ unsigned long cur_gp_seq;
+ unsigned long j;
+ long len;
+ struct task_struct *t;
+
+ // If we are being polled or there is no kthread, just leave.
+ t = READ_ONCE(rdp->nocb_gp_kthread);
+ if (rcu_nocb_poll || !t) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeNotPoll"));
+ return;
+ }
+ // Need to actually to a wakeup.
+ len = rcu_segcblist_n_cbs(&rdp->cblist);
+ if (was_alldone) {
+ rdp->qlen_last_fqs_check = len;
+ if (!irqs_disabled_flags(flags)) {
+ /* ... if queue was empty ... */
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp(rdp, false);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("WakeEmpty"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
+ TPS("WakeEmptyIsDeferred"));
+ }
+ } else if (len > rdp->qlen_last_fqs_check + qhimark) {
+ /* ... or if many callbacks queued. */
+ rdp->qlen_last_fqs_check = len;
+ j = jiffies;
+ if (j != rdp->nocb_gp_adv_time &&
+ rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
+ rcu_advance_cbs_nowake(rdp->mynode, rdp);
+ rdp->nocb_gp_adv_time = j;
+ }
+ smp_mb(); /* Enqueue before timer_pending(). */
+ if ((rdp->nocb_cb_sleep ||
+ !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
+ !timer_pending(&rdp->nocb_timer)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
+ TPS("WakeOvfIsDeferred"));
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ } else {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
+ }
+ return;
+}
+
+/*
+ * Check if we ignore this rdp.
+ *
+ * We check that without holding the nocb lock but
+ * we make sure not to miss a freshly offloaded rdp
+ * with the current ordering:
+ *
+ * rdp_offload_toggle() nocb_gp_enabled_cb()
+ * ------------------------- ----------------------------
+ * WRITE flags LOCK nocb_gp_lock
+ * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
+ * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
+ * UNLOCK nocb_gp_lock READ flags
+ */
+static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
+
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
+ bool *needwake_state)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ }
+ return false;
+ }
+
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We will ignore this rdp until it ever gets re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
+ *needwake_state = true;
+ return true;
+}
+
+
+/*
+ * No-CBs GP kthreads come here to wait for additional callbacks to show up
+ * or for grace periods to end.
+ */
+static void nocb_gp_wait(struct rcu_data *my_rdp)
+{
+ bool bypass = false;
+ long bypass_ncbs;
+ int __maybe_unused cpu = my_rdp->cpu;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool gotcbs = false;
+ unsigned long j = jiffies;
+ bool needwait_gp = false; // This prevents actual uninitialized use.
+ bool needwake;
+ bool needwake_gp;
+ struct rcu_data *rdp;
+ struct rcu_node *rnp;
+ unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
+ bool wasempty = false;
+
+ /*
+ * Each pass through the following loop checks for CBs and for the
+ * nearest grace period (if any) to wait for next. The CB kthreads
+ * and the global grace-period kthread are awakened if needed.
+ */
+ WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
+ for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
+ bool needwake_state = false;
+
+ if (!nocb_gp_enabled_cb(rdp))
+ continue;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue;
+ }
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ if (bypass_ncbs &&
+ (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
+ bypass_ncbs > 2 * qhimark)) {
+ // Bypass full or old, so flush it.
+ (void)rcu_nocb_try_flush_bypass(rdp, j);
+ bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
+ } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ continue; /* No callbacks here, try next. */
+ }
+ if (bypass_ncbs) {
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("Bypass"));
+ bypass = true;
+ }
+ rnp = rdp->mynode;
+
+ // Advance callbacks if helpful and low contention.
+ needwake_gp = false;
+ if (!rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL) ||
+ (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
+ raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
+ needwake_gp = rcu_advance_cbs(rnp, rdp);
+ wasempty = rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL);
+ raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
+ }
+ // Need to wait on some grace period?
+ WARN_ON_ONCE(wasempty &&
+ !rcu_segcblist_restempty(&rdp->cblist,
+ RCU_NEXT_READY_TAIL));
+ if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
+ if (!needwait_gp ||
+ ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
+ wait_gp_seq = cur_gp_seq;
+ needwait_gp = true;
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
+ TPS("NeedWaitGP"));
+ }
+ if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
+ needwake = rdp->nocb_cb_sleep;
+ WRITE_ONCE(rdp->nocb_cb_sleep, false);
+ smp_mb(); /* CB invocation -after- GP end. */
+ } else {
+ needwake = false;
+ }
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake) {
+ swake_up_one(&rdp->nocb_cb_wq);
+ gotcbs = true;
+ }
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+ }
+
+ my_rdp->nocb_gp_bypass = bypass;
+ my_rdp->nocb_gp_gp = needwait_gp;
+ my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
+
+ if (bypass && !rcu_nocb_poll) {
+ // At least one child with non-empty ->nocb_bypass, so set
+ // timer in order to avoid stranding its callbacks.
+ wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
+ TPS("WakeBypassIsDeferred"));
+ }
+ if (rcu_nocb_poll) {
+ /* Polling, so trace if first poll in the series. */
+ if (gotcbs)
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
+ schedule_timeout_idle(1);
+ } else if (!needwait_gp) {
+ /* Wait for callbacks to appear. */
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
+ swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
+ } else {
+ rnp = my_rdp->mynode;
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
+ swait_event_interruptible_exclusive(
+ rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
+ rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
+ !READ_ONCE(my_rdp->nocb_gp_sleep));
+ trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
+ }
+ if (!rcu_nocb_poll) {
+ raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
+ if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
+ WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
+ del_timer(&my_rdp->nocb_timer);
+ }
+ WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
+ raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
+ }
+ my_rdp->nocb_gp_seq = -1;
+ WARN_ON(signal_pending(current));
+}
+
+/*
+ * No-CBs grace-period-wait kthread. There is one of these per group
+ * of CPUs, but only once at least one CPU in that group has come online
+ * at least once since boot. This kthread checks for newly posted
+ * callbacks from any of the CPUs it is responsible for, waits for a
+ * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
+ * that then have callback-invocation work to do.
+ */
+static int rcu_nocb_gp_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ for (;;) {
+ WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
+ nocb_gp_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+static inline bool nocb_cb_can_run(struct rcu_data *rdp)
+{
+ u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
+ return rcu_segcblist_test_flags(&rdp->cblist, flags);
+}
+
+static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
+{
+ return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
+}
+
+/*
+ * Invoke any ready callbacks from the corresponding no-CBs CPU,
+ * then, if there are no more, wait for more to appear.
+ */
+static void nocb_cb_wait(struct rcu_data *rdp)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long cur_gp_seq;
+ unsigned long flags;
+ bool needwake_state = false;
+ bool needwake_gp = false;
+ bool can_sleep = true;
+ struct rcu_node *rnp = rdp->mynode;
+
+ local_irq_save(flags);
+ rcu_momentary_dyntick_idle();
+ local_irq_restore(flags);
+ /*
+ * Disable BH to provide the expected environment. Also, when
+ * transitioning to/from NOCB mode, a self-requeuing callback might
+ * be invoked from softirq. A short grace period could cause both
+ * instances of this callback would execute concurrently.
+ */
+ local_bh_disable();
+ rcu_do_batch(rdp);
+ local_bh_enable();
+ lockdep_assert_irqs_enabled();
+ rcu_nocb_lock_irqsave(rdp, flags);
+ if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
+ rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
+ raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
+ needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
+ raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
+ }
+
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+ if (rcu_segcblist_ready_cbs(cblist))
+ can_sleep = false;
+ } else {
+ /*
+ * De-offloading. Clear our flag and notify the de-offload worker.
+ * We won't touch the callbacks and keep sleeping until we ever
+ * get re-offloaded.
+ */
+ WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
+ rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
+ if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
+ needwake_state = true;
+ }
+
+ WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
+
+ if (rdp->nocb_cb_sleep)
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
+
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+ if (needwake_gp)
+ rcu_gp_kthread_wake();
+
+ if (needwake_state)
+ swake_up_one(&rdp->nocb_state_wq);
+
+ do {
+ swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
+ nocb_cb_wait_cond(rdp));
+
+ // VVV Ensure CB invocation follows _sleep test.
+ if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
+ WARN_ON(signal_pending(current));
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
+ }
+ } while (!nocb_cb_can_run(rdp));
+}
+
+/*
+ * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
+ * nocb_cb_wait() to do the dirty work.
+ */
+static int rcu_nocb_cb_kthread(void *arg)
+{
+ struct rcu_data *rdp = arg;
+
+ // Each pass through this loop does one callback batch, and,
+ // if there are no more ready callbacks, waits for them.
+ for (;;) {
+ nocb_cb_wait(rdp);
+ cond_resched_tasks_rcu_qs();
+ }
+ return 0;
+}
+
+/* Is a deferred wakeup of rcu_nocb_kthread() required? */
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread(). */
+static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
+ struct rcu_data *rdp, int level,
+ unsigned long flags)
+ __releases(rdp_gp->nocb_gp_lock)
+{
+ int ndw;
+ int ret;
+
+ if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+ return false;
+ }
+
+ ndw = rdp_gp->nocb_defer_wakeup;
+ ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
+
+ return ret;
+}
+
+/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
+static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
+{
+ unsigned long flags;
+ struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
+
+ WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
+ trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
+
+ raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
+ smp_mb__after_spinlock(); /* Timer expire before wakeup. */
+ do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
+}
+
+/*
+ * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
+ * This means we do an inexact common-case check. Note that if
+ * we miss, ->nocb_timer will eventually clean things up.
+ */
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ unsigned long flags;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+
+ if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
+ return false;
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
+}
+
+void rcu_nocb_flush_deferred_wakeup(void)
+{
+ do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
+
+static int rdp_offload_toggle(struct rcu_data *rdp,
+ bool offload, unsigned long flags)
+ __releases(rdp->nocb_lock)
+{
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
+ bool wake_gp = false;
+
+ rcu_segcblist_offload(cblist, offload);
+
+ if (rdp->nocb_cb_sleep)
+ rdp->nocb_cb_sleep = false;
+ rcu_nocb_unlock_irqrestore(rdp, flags);
+
+ /*
+ * Ignore former value of nocb_cb_sleep and force wake up as it could
+ * have been spuriously set to false already.
+ */
+ swake_up_one(&rdp->nocb_cb_wq);
+
+ raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
+ if (rdp_gp->nocb_gp_sleep) {
+ rdp_gp->nocb_gp_sleep = false;
+ wake_gp = true;
+ }
+ raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
+
+ if (wake_gp)
+ wake_up_process(rdp_gp->nocb_gp_kthread);
+
+ return 0;
+}
+
+static long rcu_nocb_rdp_deoffload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+
+ pr_info("De-offloading %d\n", rdp->cpu);
+
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Flush once and for all now. This suffices because we are
+ * running on the target CPU holding ->nocb_lock (thus having
+ * interrupts disabled), and because rdp_offload_toggle()
+ * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
+ * Thus future calls to rcu_segcblist_completely_offloaded() will
+ * return false, which means that future calls to rcu_nocb_try_bypass()
+ * will refuse to put anything into the bypass.
+ */
+ WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
+ ret = rdp_offload_toggle(rdp, false, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
+ SEGCBLIST_KTHREAD_GP));
+ /*
+ * Lock one last time to acquire latest callback updates from kthreads
+ * so we can later handle callbacks locally without locking.
+ */
+ rcu_nocb_lock_irqsave(rdp, flags);
+ /*
+ * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
+ * lock is released but how about being paranoid for once?
+ */
+ rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
+ /*
+ * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
+ * rcu_nocb_unlock_irqrestore() anymore.
+ */
+ raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
+
+ /* Sanity check */
+ WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
+
+
+ return ret;
+}
+
+int rcu_nocb_cpu_deoffload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
+ if (!ret)
+ cpumask_clear_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
+ ret = -EINVAL;
+ }
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
+
+static long rcu_nocb_rdp_offload(void *arg)
+{
+ struct rcu_data *rdp = arg;
+ struct rcu_segcblist *cblist = &rdp->cblist;
+ unsigned long flags;
+ int ret;
+
+ WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
+ /*
+ * For now we only support re-offload, ie: the rdp must have been
+ * offloaded on boot first.
+ */
+ if (!rdp->nocb_gp_rdp)
+ return -EINVAL;
+
+ pr_info("Offloading %d\n", rdp->cpu);
+ /*
+ * Can't use rcu_nocb_lock_irqsave() while we are in
+ * SEGCBLIST_SOFTIRQ_ONLY mode.
+ */
+ raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
+
+ /*
+ * We didn't take the nocb lock while working on the
+ * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
+ * Every modifications that have been done previously on
+ * rdp->cblist must be visible remotely by the nocb kthreads
+ * upon wake up after reading the cblist flags.
+ *
+ * The layout against nocb_lock enforces that ordering:
+ *
+ * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
+ * ------------------------- ----------------------------
+ * WRITE callbacks rcu_nocb_lock()
+ * rcu_nocb_lock() READ flags
+ * WRITE flags READ callbacks
+ * rcu_nocb_unlock() rcu_nocb_unlock()
+ */
+ ret = rdp_offload_toggle(rdp, true, flags);
+ swait_event_exclusive(rdp->nocb_state_wq,
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
+ rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
+
+ return ret;
+}
+
+int rcu_nocb_cpu_offload(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ int ret = 0;
+
+ mutex_lock(&rcu_state.barrier_mutex);
+ cpus_read_lock();
+ if (!rcu_rdp_is_offloaded(rdp)) {
+ if (cpu_online(cpu)) {
+ ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
+ if (!ret)
+ cpumask_set_cpu(cpu, rcu_nocb_mask);
+ } else {
+ pr_info("NOCB: Can't CB-offload an offline CPU\n");
+ ret = -EINVAL;
+ }
+ }
+ cpus_read_unlock();
+ mutex_unlock(&rcu_state.barrier_mutex);
+
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
+
+void __init rcu_init_nohz(void)
+{
+ int cpu;
+ bool need_rcu_nocb_mask = false;
+ struct rcu_data *rdp;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
+ need_rcu_nocb_mask = true;
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
+ if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
+ pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
+ return;
+ }
+ }
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+
+#if defined(CONFIG_NO_HZ_FULL)
+ if (tick_nohz_full_running)
+ cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
+#endif /* #if defined(CONFIG_NO_HZ_FULL) */
+
+ if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
+ pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
+ cpumask_and(rcu_nocb_mask, cpu_possible_mask,
+ rcu_nocb_mask);
+ }
+ if (cpumask_empty(rcu_nocb_mask))
+ pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
+ else
+ pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
+ cpumask_pr_args(rcu_nocb_mask));
+ if (rcu_nocb_poll)
+ pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
+
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rcu_segcblist_empty(&rdp->cblist))
+ rcu_segcblist_init(&rdp->cblist);
+ rcu_segcblist_offload(&rdp->cblist, true);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
+ rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
+ }
+ rcu_organize_nocb_kthreads();
+}
+
+/* Initialize per-rcu_data variables for no-CBs CPUs. */
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+ init_swait_queue_head(&rdp->nocb_cb_wq);
+ init_swait_queue_head(&rdp->nocb_gp_wq);
+ init_swait_queue_head(&rdp->nocb_state_wq);
+ raw_spin_lock_init(&rdp->nocb_lock);
+ raw_spin_lock_init(&rdp->nocb_bypass_lock);
+ raw_spin_lock_init(&rdp->nocb_gp_lock);
+ timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
+ rcu_cblist_init(&rdp->nocb_bypass);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
+ * for this CPU's group has not yet been created, spawn it as well.
+ */
+static void rcu_spawn_one_nocb_kthread(int cpu)
+{
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
+ struct rcu_data *rdp_gp;
+ struct task_struct *t;
+
+ /*
+ * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
+ * then nothing to do.
+ */
+ if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
+ return;
+
+ /* If we didn't spawn the GP kthread first, reorganize! */
+ rdp_gp = rdp->nocb_gp_rdp;
+ if (!rdp_gp->nocb_gp_kthread) {
+ t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
+ "rcuog/%d", rdp_gp->cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
+ }
+
+ /* Spawn the kthread for this CPU. */
+ t = kthread_run(rcu_nocb_cb_kthread, rdp,
+ "rcuo%c/%d", rcu_state.abbr, cpu);
+ if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
+ return;
+ WRITE_ONCE(rdp->nocb_cb_kthread, t);
+ WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
+}
+
+/*
+ * If the specified CPU is a no-CBs CPU that does not already have its
+ * rcuo kthread, spawn it.
+ */
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+ if (rcu_scheduler_fully_active)
+ rcu_spawn_one_nocb_kthread(cpu);
+}
+
+/*
+ * Once the scheduler is running, spawn rcuo kthreads for all online
+ * no-CBs CPUs. This assumes that the early_initcall()s happen before
+ * non-boot CPUs come online -- if this changes, we will need to add
+ * some mutual exclusion.
+ */
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+ int cpu;
+
+ for_each_online_cpu(cpu)
+ rcu_spawn_cpu_nocb_kthread(cpu);
+}
+
+/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
+static int rcu_nocb_gp_stride = -1;
+module_param(rcu_nocb_gp_stride, int, 0444);
+
+/*
+ * Initialize GP-CB relationships for all no-CBs CPU.
+ */
+static void __init rcu_organize_nocb_kthreads(void)
+{
+ int cpu;
+ bool firsttime = true;
+ bool gotnocbs = false;
+ bool gotnocbscbs = true;
+ int ls = rcu_nocb_gp_stride;
+ int nl = 0; /* Next GP kthread. */
+ struct rcu_data *rdp;
+ struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
+ struct rcu_data *rdp_prev = NULL;
+
+ if (!cpumask_available(rcu_nocb_mask))
+ return;
+ if (ls == -1) {
+ ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
+ rcu_nocb_gp_stride = ls;
+ }
+
+ /*
+ * Each pass through this loop sets up one rcu_data structure.
+ * Should the corresponding CPU come online in the future, then
+ * we will spawn the needed set of rcu_nocb_kthread() kthreads.
+ */
+ for_each_cpu(cpu, rcu_nocb_mask) {
+ rdp = per_cpu_ptr(&rcu_data, cpu);
+ if (rdp->cpu >= nl) {
+ /* New GP kthread, set up for CBs & next GP. */
+ gotnocbs = true;
+ nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
+ rdp->nocb_gp_rdp = rdp;
+ rdp_gp = rdp;
+ if (dump_tree) {
+ if (!firsttime)
+ pr_cont("%s\n", gotnocbscbs
+ ? "" : " (self only)");
+ gotnocbscbs = false;
+ firsttime = false;
+ pr_alert("%s: No-CB GP kthread CPU %d:",
+ __func__, cpu);
+ }
+ } else {
+ /* Another CB kthread, link to previous GP kthread. */
+ gotnocbscbs = true;
+ rdp->nocb_gp_rdp = rdp_gp;
+ rdp_prev->nocb_next_cb_rdp = rdp;
+ if (dump_tree)
+ pr_cont(" %d", cpu);
+ }
+ rdp_prev = rdp;
+ }
+ if (gotnocbs && dump_tree)
+ pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
+}
+
+/*
+ * Bind the current task to the offloaded CPUs. If there are no offloaded
+ * CPUs, leave the task unbound. Splat if the bind attempt fails.
+ */
+void rcu_bind_current_to_nocb(void)
+{
+ if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
+ WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
+}
+EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
+
+// The ->on_cpu field is available only in CONFIG_SMP=y, so...
+#ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
+}
+#else // #ifdef CONFIG_SMP
+static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
+{
+ return "";
+}
+#endif // #else #ifdef CONFIG_SMP
+
+/*
+ * Dump out nocb grace-period kthread state for the specified rcu_data
+ * structure.
+ */
+static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
+{
+ struct rcu_node *rnp = rdp->mynode;
+
+ pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
+ rdp->cpu,
+ "kK"[!!rdp->nocb_gp_kthread],
+ "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "tT"[timer_pending(&rdp->nocb_timer)],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[swait_active(&rdp->nocb_gp_wq)],
+ ".W"[swait_active(&rnp->nocb_gp_wq[0])],
+ ".W"[swait_active(&rnp->nocb_gp_wq[1])],
+ ".B"[!!rdp->nocb_gp_bypass],
+ ".G"[!!rdp->nocb_gp_gp],
+ (long)rdp->nocb_gp_seq,
+ rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
+ rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+}
+
+/* Dump out nocb kthread state for the specified rcu_data structure. */
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+ char bufw[20];
+ char bufr[20];
+ struct rcu_segcblist *rsclp = &rdp->cblist;
+ bool waslocked;
+ bool wassleep;
+
+ if (rdp->nocb_gp_rdp == rdp)
+ show_rcu_nocb_gp_state(rdp);
+
+ sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
+ sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
+ pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
+ rdp->cpu, rdp->nocb_gp_rdp->cpu,
+ rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
+ "kK"[!!rdp->nocb_cb_kthread],
+ "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
+ "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
+ "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
+ "sS"[!!rdp->nocb_cb_sleep],
+ ".W"[swait_active(&rdp->nocb_cb_wq)],
+ jiffies - rdp->nocb_bypass_first,
+ jiffies - rdp->nocb_nobypass_last,
+ rdp->nocb_nobypass_count,
+ ".D"[rcu_segcblist_ready_cbs(rsclp)],
+ ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
+ ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
+ rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
+ ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
+ ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
+ rcu_segcblist_n_cbs(&rdp->cblist),
+ rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
+ rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
+ show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
+
+ /* It is OK for GP kthreads to have GP state. */
+ if (rdp->nocb_gp_rdp == rdp)
+ return;
+
+ waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
+ wassleep = swait_active(&rdp->nocb_gp_wq);
+ if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
+ return; /* Nothing untoward. */
+
+ pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
+ "lL"[waslocked],
+ "dD"[!!rdp->nocb_defer_wakeup],
+ "sS"[!!rdp->nocb_gp_sleep],
+ ".W"[wassleep]);
+}
+
+#else /* #ifdef CONFIG_RCU_NOCB_CPU */
+
+static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
+{
+ return 0;
+}
+
+static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
+{
+ return false;
+}
+
+/* No ->nocb_lock to acquire. */
+static void rcu_nocb_lock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock(struct rcu_data *rdp)
+{
+}
+
+/* No ->nocb_lock to release. */
+static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
+ unsigned long flags)
+{
+ local_irq_restore(flags);
+}
+
+/* Lockdep check that ->cblist may be safely accessed. */
+static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
+{
+ lockdep_assert_irqs_disabled();
+}
+
+static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
+{
+}
+
+static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
+{
+ return NULL;
+}
+
+static void rcu_init_one_nocb(struct rcu_node *rnp)
+{
+}
+
+static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ unsigned long j)
+{
+ return true;
+}
+
+static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
+ bool *was_alldone, unsigned long flags)
+{
+ return false;
+}
+
+static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
+ unsigned long flags)
+{
+ WARN_ON_ONCE(1); /* Should be dead code! */
+}
+
+static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
+{
+}
+
+static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
+{
+ return false;
+}
+
+static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
+{
+ return false;
+}
+
+static void rcu_spawn_cpu_nocb_kthread(int cpu)
+{
+}
+
+static void __init rcu_spawn_nocb_kthreads(void)
+{
+}
+
+static void show_rcu_nocb_state(struct rcu_data *rdp)
+{
+}
+
+#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0ff5e4fb933e..d070059163d7 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -13,39 +13,6 @@
#include "../locking/rtmutex_common.h"
-#ifdef CONFIG_RCU_NOCB_CPU
-static cpumask_var_t rcu_nocb_mask; /* CPUs to have callbacks offloaded. */
-static bool __read_mostly rcu_nocb_poll; /* Offload kthread are to poll. */
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return lockdep_is_held(&rdp->nocb_lock);
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- /* Race on early boot between thread creation and assignment */
- if (!rdp->nocb_cb_kthread || !rdp->nocb_gp_kthread)
- return true;
-
- if (current == rdp->nocb_cb_kthread || current == rdp->nocb_gp_kthread)
- if (in_task())
- return true;
- return false;
-}
-
-#else
-static inline int rcu_lockdep_is_held_nocb(struct rcu_data *rdp)
-{
- return 0;
-}
-
-static inline bool rcu_current_is_nocb_kthread(struct rcu_data *rdp)
-{
- return false;
-}
-
-#endif /* #ifdef CONFIG_RCU_NOCB_CPU */
-
static bool rcu_rdp_is_offloaded(struct rcu_data *rdp)
{
/*
@@ -346,7 +313,7 @@ void rcu_note_context_switch(bool preempt)
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
- WARN_ON_ONCE(!preempt && rcu_preempt_depth() > 0);
+ WARN_ONCE(!preempt && rcu_preempt_depth() > 0, "Voluntary context switch within RCU read-side critical section!");
if (rcu_preempt_depth() > 0 &&
!t->rcu_read_unlock_special.b.blocked) {
@@ -405,17 +372,20 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
static void rcu_preempt_read_enter(void)
{
- current->rcu_read_lock_nesting++;
+ WRITE_ONCE(current->rcu_read_lock_nesting, READ_ONCE(current->rcu_read_lock_nesting) + 1);
}
static int rcu_preempt_read_exit(void)
{
- return --current->rcu_read_lock_nesting;
+ int ret = READ_ONCE(current->rcu_read_lock_nesting) - 1;
+
+ WRITE_ONCE(current->rcu_read_lock_nesting, ret);
+ return ret;
}
static void rcu_preempt_depth_set(int val)
{
- current->rcu_read_lock_nesting = val;
+ WRITE_ONCE(current->rcu_read_lock_nesting, val);
}
/*
@@ -1479,1460 +1449,6 @@ static void rcu_cleanup_after_idle(void)
#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */
-#ifdef CONFIG_RCU_NOCB_CPU
-
-/*
- * Offload callback processing from the boot-time-specified set of CPUs
- * specified by rcu_nocb_mask. For the CPUs in the set, there are kthreads
- * created that pull the callbacks from the corresponding CPU, wait for
- * a grace period to elapse, and invoke the callbacks. These kthreads
- * are organized into GP kthreads, which manage incoming callbacks, wait for
- * grace periods, and awaken CB kthreads, and the CB kthreads, which only
- * invoke callbacks. Each GP kthread invokes its own CBs. The no-CBs CPUs
- * do a wake_up() on their GP kthread when they insert a callback into any
- * empty list, unless the rcu_nocb_poll boot parameter has been specified,
- * in which case each kthread actively polls its CPU. (Which isn't so great
- * for energy efficiency, but which does reduce RCU's overhead on that CPU.)
- *
- * This is intended to be used in conjunction with Frederic Weisbecker's
- * adaptive-idle work, which would seriously reduce OS jitter on CPUs
- * running CPU-bound user-mode computations.
- *
- * Offloading of callbacks can also be used as an energy-efficiency
- * measure because CPUs with no RCU callbacks queued are more aggressive
- * about entering dyntick-idle mode.
- */
-
-
-/*
- * Parse the boot-time rcu_nocb_mask CPU list from the kernel parameters.
- * If the list is invalid, a warning is emitted and all CPUs are offloaded.
- */
-static int __init rcu_nocb_setup(char *str)
-{
- alloc_bootmem_cpumask_var(&rcu_nocb_mask);
- if (cpulist_parse(str, rcu_nocb_mask)) {
- pr_warn("rcu_nocbs= bad CPU range, all CPUs set\n");
- cpumask_setall(rcu_nocb_mask);
- }
- return 1;
-}
-__setup("rcu_nocbs=", rcu_nocb_setup);
-
-static int __init parse_rcu_nocb_poll(char *arg)
-{
- rcu_nocb_poll = true;
- return 0;
-}
-early_param("rcu_nocb_poll", parse_rcu_nocb_poll);
-
-/*
- * Don't bother bypassing ->cblist if the call_rcu() rate is low.
- * After all, the main point of bypassing is to avoid lock contention
- * on ->nocb_lock, which only can happen at high call_rcu() rates.
- */
-static int nocb_nobypass_lim_per_jiffy = 16 * 1000 / HZ;
-module_param(nocb_nobypass_lim_per_jiffy, int, 0);
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_bypass_lock. If the
- * lock isn't immediately available, increment ->nocb_lock_contended to
- * flag the contention.
- */
-static void rcu_nocb_bypass_lock(struct rcu_data *rdp)
- __acquires(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- if (raw_spin_trylock(&rdp->nocb_bypass_lock))
- return;
- atomic_inc(&rdp->nocb_lock_contended);
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- smp_mb__after_atomic(); /* atomic_inc() before lock. */
- raw_spin_lock(&rdp->nocb_bypass_lock);
- smp_mb__before_atomic(); /* atomic_dec() after lock. */
- atomic_dec(&rdp->nocb_lock_contended);
-}
-
-/*
- * Spinwait until the specified rcu_data structure's ->nocb_lock is
- * not contended. Please note that this is extremely special-purpose,
- * relying on the fact that at most two kthreads and one CPU contend for
- * this lock, and also that the two kthreads are guaranteed to have frequent
- * grace-period-duration time intervals between successive acquisitions
- * of the lock. This allows us to use an extremely simple throttling
- * mechanism, and further to apply it only to the CPU doing floods of
- * call_rcu() invocations. Don't try this at home!
- */
-static void rcu_nocb_wait_contended(struct rcu_data *rdp)
-{
- WARN_ON_ONCE(smp_processor_id() != rdp->cpu);
- while (WARN_ON_ONCE(atomic_read(&rdp->nocb_lock_contended)))
- cpu_relax();
-}
-
-/*
- * Conditionally acquire the specified rcu_data structure's
- * ->nocb_bypass_lock.
- */
-static bool rcu_nocb_bypass_trylock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- return raw_spin_trylock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_bypass_lock.
- */
-static void rcu_nocb_bypass_unlock(struct rcu_data *rdp)
- __releases(&rdp->nocb_bypass_lock)
-{
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_bypass_lock);
-}
-
-/*
- * Acquire the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (!rcu_rdp_is_offloaded(rdp))
- return;
- raw_spin_lock(&rdp->nocb_lock);
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock, but only
- * if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock(&rdp->nocb_lock);
- }
-}
-
-/*
- * Release the specified rcu_data structure's ->nocb_lock and restore
- * interrupts, but only if it corresponds to a no-CBs CPU.
- */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- if (rcu_rdp_is_offloaded(rdp)) {
- lockdep_assert_irqs_disabled();
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
- } else {
- local_irq_restore(flags);
- }
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
- if (rcu_rdp_is_offloaded(rdp))
- lockdep_assert_held(&rdp->nocb_lock);
-}
-
-/*
- * Wake up any no-CBs CPUs' kthreads that were waiting on the just-ended
- * grace period.
- */
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
- swake_up_all(sq);
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return &rnp->nocb_gp_wq[rcu_seq_ctr(rnp->gp_seq) & 0x1];
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
- init_swait_queue_head(&rnp->nocb_gp_wq[0]);
- init_swait_queue_head(&rnp->nocb_gp_wq[1]);
-}
-
-/* Is the specified CPU a no-CBs CPU? */
-bool rcu_is_nocb_cpu(int cpu)
-{
- if (cpumask_available(rcu_nocb_mask))
- return cpumask_test_cpu(cpu, rcu_nocb_mask);
- return false;
-}
-
-static bool __wake_nocb_gp(struct rcu_data *rdp_gp,
- struct rcu_data *rdp,
- bool force, unsigned long flags)
- __releases(rdp_gp->nocb_gp_lock)
-{
- bool needwake = false;
-
- if (!READ_ONCE(rdp_gp->nocb_gp_kthread)) {
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("AlreadyAwake"));
- return false;
- }
-
- if (rdp_gp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&rdp_gp->nocb_timer);
- }
-
- if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) {
- WRITE_ONCE(rdp_gp->nocb_gp_sleep, false);
- needwake = true;
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- if (needwake) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DoWake"));
- wake_up_process(rdp_gp->nocb_gp_kthread);
- }
-
- return needwake;
-}
-
-/*
- * Kick the GP kthread for this NOCB group.
- */
-static bool wake_nocb_gp(struct rcu_data *rdp, bool force)
-{
- unsigned long flags;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- return __wake_nocb_gp(rdp_gp, rdp, force, flags);
-}
-
-/*
- * Arrange to wake the GP kthread for this NOCB group at some future
- * time when it is safe to do so.
- */
-static void wake_nocb_gp_defer(struct rcu_data *rdp, int waketype,
- const char *reason)
-{
- unsigned long flags;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
-
- /*
- * Bypass wakeup overrides previous deferments. In case
- * of callback storm, no need to wake up too early.
- */
- if (waketype == RCU_NOCB_WAKE_BYPASS) {
- mod_timer(&rdp_gp->nocb_timer, jiffies + 2);
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
- } else {
- if (rdp_gp->nocb_defer_wakeup < RCU_NOCB_WAKE)
- mod_timer(&rdp_gp->nocb_timer, jiffies + 1);
- if (rdp_gp->nocb_defer_wakeup < waketype)
- WRITE_ONCE(rdp_gp->nocb_defer_wakeup, waketype);
- }
-
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, reason);
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_do_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- struct rcu_cblist rcl;
-
- WARN_ON_ONCE(!rcu_rdp_is_offloaded(rdp));
- rcu_lockdep_assert_cblist_protected(rdp);
- lockdep_assert_held(&rdp->nocb_bypass_lock);
- if (rhp && !rcu_cblist_n_cbs(&rdp->nocb_bypass)) {
- raw_spin_unlock(&rdp->nocb_bypass_lock);
- return false;
- }
- /* Note: ->cblist.len already accounts for ->nocb_bypass contents. */
- if (rhp)
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_flush_enqueue(&rcl, &rdp->nocb_bypass, rhp);
- rcu_segcblist_insert_pend_cbs(&rdp->cblist, &rcl);
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- rcu_nocb_bypass_unlock(rdp);
- return true;
-}
-
-/*
- * Flush the ->nocb_bypass queue into ->cblist, enqueuing rhp if non-NULL.
- * However, if there is a callback to be enqueued and if ->nocb_bypass
- * proves to be initially empty, just return false because the no-CB GP
- * kthread may need to be awakened in this case.
- *
- * Note that this function always returns true if rhp is NULL.
- */
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- if (!rcu_rdp_is_offloaded(rdp))
- return true;
- rcu_lockdep_assert_cblist_protected(rdp);
- rcu_nocb_bypass_lock(rdp);
- return rcu_nocb_do_flush_bypass(rdp, rhp, j);
-}
-
-/*
- * If the ->nocb_bypass_lock is immediately available, flush the
- * ->nocb_bypass queue into ->cblist.
- */
-static void rcu_nocb_try_flush_bypass(struct rcu_data *rdp, unsigned long j)
-{
- rcu_lockdep_assert_cblist_protected(rdp);
- if (!rcu_rdp_is_offloaded(rdp) ||
- !rcu_nocb_bypass_trylock(rdp))
- return;
- WARN_ON_ONCE(!rcu_nocb_do_flush_bypass(rdp, NULL, j));
-}
-
-/*
- * See whether it is appropriate to use the ->nocb_bypass list in order
- * to control contention on ->nocb_lock. A limited number of direct
- * enqueues are permitted into ->cblist per jiffy. If ->nocb_bypass
- * is non-empty, further callbacks must be placed into ->nocb_bypass,
- * otherwise rcu_barrier() breaks. Use rcu_nocb_flush_bypass() to switch
- * back to direct use of ->cblist. However, ->nocb_bypass should not be
- * used if ->cblist is empty, because otherwise callbacks can be stranded
- * on ->nocb_bypass because we cannot count on the current CPU ever again
- * invoking call_rcu(). The general rule is that if ->nocb_bypass is
- * non-empty, the corresponding no-CBs grace-period kthread must not be
- * in an indefinite sleep state.
- *
- * Finally, it is not permitted to use the bypass during early boot,
- * as doing so would confuse the auto-initialization code. Besides
- * which, there is no point in worrying about lock contention while
- * there is only one CPU in operation.
- */
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- unsigned long c;
- unsigned long cur_gp_seq;
- unsigned long j = jiffies;
- long ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
-
- lockdep_assert_irqs_disabled();
-
- // Pure softirq/rcuc based processing: no bypassing, no
- // locking.
- if (!rcu_rdp_is_offloaded(rdp)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // In the process of (de-)offloading: no bypassing, but
- // locking.
- if (!rcu_segcblist_completely_offloaded(&rdp->cblist)) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false; /* Not offloaded, no bypassing. */
- }
-
- // Don't use ->nocb_bypass during early boot.
- if (rcu_scheduler_active != RCU_SCHEDULER_RUNNING) {
- rcu_nocb_lock(rdp);
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- return false;
- }
-
- // If we have advanced to a new jiffy, reset counts to allow
- // moving back from ->nocb_bypass to ->cblist.
- if (j == rdp->nocb_nobypass_last) {
- c = rdp->nocb_nobypass_count + 1;
- } else {
- WRITE_ONCE(rdp->nocb_nobypass_last, j);
- c = rdp->nocb_nobypass_count - nocb_nobypass_lim_per_jiffy;
- if (ULONG_CMP_LT(rdp->nocb_nobypass_count,
- nocb_nobypass_lim_per_jiffy))
- c = 0;
- else if (c > nocb_nobypass_lim_per_jiffy)
- c = nocb_nobypass_lim_per_jiffy;
- }
- WRITE_ONCE(rdp->nocb_nobypass_count, c);
-
- // If there hasn't yet been all that many ->cblist enqueues
- // this jiffy, tell the caller to enqueue onto ->cblist. But flush
- // ->nocb_bypass first.
- if (rdp->nocb_nobypass_count < nocb_nobypass_lim_per_jiffy) {
- rcu_nocb_lock(rdp);
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, j));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
-
- // If ->nocb_bypass has been used too long or is too full,
- // flush ->nocb_bypass to ->cblist.
- if ((ncbs && j != READ_ONCE(rdp->nocb_bypass_first)) ||
- ncbs >= qhimark) {
- rcu_nocb_lock(rdp);
- if (!rcu_nocb_flush_bypass(rdp, rhp, j)) {
- *was_alldone = !rcu_segcblist_pend_cbs(&rdp->cblist);
- if (*was_alldone)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstQ"));
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
- return false; // Caller must enqueue the callback.
- }
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- return true; // Callback already enqueued.
- }
-
- // We need to use the bypass.
- rcu_nocb_wait_contended(rdp);
- rcu_nocb_bypass_lock(rdp);
- ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- rcu_segcblist_inc_len(&rdp->cblist); /* Must precede enqueue. */
- rcu_cblist_enqueue(&rdp->nocb_bypass, rhp);
- if (!ncbs) {
- WRITE_ONCE(rdp->nocb_bypass_first, j);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("FirstBQ"));
- }
- rcu_nocb_bypass_unlock(rdp);
- smp_mb(); /* Order enqueue before wake. */
- if (ncbs) {
- local_irq_restore(flags);
- } else {
- // No-CBs GP kthread might be indefinitely asleep, if so, wake.
- rcu_nocb_lock(rdp); // Rare during call_rcu() flood.
- if (!rcu_segcblist_pend_cbs(&rdp->cblist)) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQwake"));
- __call_rcu_nocb_wake(rdp, true, flags);
- } else {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("FirstBQnoWake"));
- rcu_nocb_unlock_irqrestore(rdp, flags);
- }
- }
- return true; // Callback already enqueued.
-}
-
-/*
- * Awaken the no-CBs grace-period kthread if needed, either due to it
- * legitimately being asleep or due to overload conditions.
- *
- * If warranted, also wake up the kthread servicing this CPUs queues.
- */
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_alldone,
- unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- unsigned long cur_gp_seq;
- unsigned long j;
- long len;
- struct task_struct *t;
-
- // If we are being polled or there is no kthread, just leave.
- t = READ_ONCE(rdp->nocb_gp_kthread);
- if (rcu_nocb_poll || !t) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeNotPoll"));
- return;
- }
- // Need to actually to a wakeup.
- len = rcu_segcblist_n_cbs(&rdp->cblist);
- if (was_alldone) {
- rdp->qlen_last_fqs_check = len;
- if (!irqs_disabled_flags(flags)) {
- /* ... if queue was empty ... */
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp(rdp, false);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("WakeEmpty"));
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE,
- TPS("WakeEmptyIsDeferred"));
- }
- } else if (len > rdp->qlen_last_fqs_check + qhimark) {
- /* ... or if many callbacks queued. */
- rdp->qlen_last_fqs_check = len;
- j = jiffies;
- if (j != rdp->nocb_gp_adv_time &&
- rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rdp->mynode->gp_seq, cur_gp_seq)) {
- rcu_advance_cbs_nowake(rdp->mynode, rdp);
- rdp->nocb_gp_adv_time = j;
- }
- smp_mb(); /* Enqueue before timer_pending(). */
- if ((rdp->nocb_cb_sleep ||
- !rcu_segcblist_ready_cbs(&rdp->cblist)) &&
- !timer_pending(&rdp->nocb_timer)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- wake_nocb_gp_defer(rdp, RCU_NOCB_WAKE_FORCE,
- TPS("WakeOvfIsDeferred"));
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- } else {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WakeNot"));
- }
- return;
-}
-
-/*
- * Check if we ignore this rdp.
- *
- * We check that without holding the nocb lock but
- * we make sure not to miss a freshly offloaded rdp
- * with the current ordering:
- *
- * rdp_offload_toggle() nocb_gp_enabled_cb()
- * ------------------------- ----------------------------
- * WRITE flags LOCK nocb_gp_lock
- * LOCK nocb_gp_lock READ/WRITE nocb_gp_sleep
- * READ/WRITE nocb_gp_sleep UNLOCK nocb_gp_lock
- * UNLOCK nocb_gp_lock READ flags
- */
-static inline bool nocb_gp_enabled_cb(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_GP;
-
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_gp_update_state_deoffloading(struct rcu_data *rdp,
- bool *needwake_state)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- }
- return false;
- }
-
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We will ignore this rdp until it ever gets re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_GP);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB))
- *needwake_state = true;
- return true;
-}
-
-
-/*
- * No-CBs GP kthreads come here to wait for additional callbacks to show up
- * or for grace periods to end.
- */
-static void nocb_gp_wait(struct rcu_data *my_rdp)
-{
- bool bypass = false;
- long bypass_ncbs;
- int __maybe_unused cpu = my_rdp->cpu;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool gotcbs = false;
- unsigned long j = jiffies;
- bool needwait_gp = false; // This prevents actual uninitialized use.
- bool needwake;
- bool needwake_gp;
- struct rcu_data *rdp;
- struct rcu_node *rnp;
- unsigned long wait_gp_seq = 0; // Suppress "use uninitialized" warning.
- bool wasempty = false;
-
- /*
- * Each pass through the following loop checks for CBs and for the
- * nearest grace period (if any) to wait for next. The CB kthreads
- * and the global grace-period kthread are awakened if needed.
- */
- WARN_ON_ONCE(my_rdp->nocb_gp_rdp != my_rdp);
- for (rdp = my_rdp; rdp; rdp = rdp->nocb_next_cb_rdp) {
- bool needwake_state = false;
-
- if (!nocb_gp_enabled_cb(rdp))
- continue;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Check"));
- rcu_nocb_lock_irqsave(rdp, flags);
- if (nocb_gp_update_state_deoffloading(rdp, &needwake_state)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue;
- }
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- if (bypass_ncbs &&
- (time_after(j, READ_ONCE(rdp->nocb_bypass_first) + 1) ||
- bypass_ncbs > 2 * qhimark)) {
- // Bypass full or old, so flush it.
- (void)rcu_nocb_try_flush_bypass(rdp, j);
- bypass_ncbs = rcu_cblist_n_cbs(&rdp->nocb_bypass);
- } else if (!bypass_ncbs && rcu_segcblist_empty(&rdp->cblist)) {
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- continue; /* No callbacks here, try next. */
- }
- if (bypass_ncbs) {
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("Bypass"));
- bypass = true;
- }
- rnp = rdp->mynode;
-
- // Advance callbacks if helpful and low contention.
- needwake_gp = false;
- if (!rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL) ||
- (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq))) {
- raw_spin_lock_rcu_node(rnp); /* irqs disabled. */
- needwake_gp = rcu_advance_cbs(rnp, rdp);
- wasempty = rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL);
- raw_spin_unlock_rcu_node(rnp); /* irqs disabled. */
- }
- // Need to wait on some grace period?
- WARN_ON_ONCE(wasempty &&
- !rcu_segcblist_restempty(&rdp->cblist,
- RCU_NEXT_READY_TAIL));
- if (rcu_segcblist_nextgp(&rdp->cblist, &cur_gp_seq)) {
- if (!needwait_gp ||
- ULONG_CMP_LT(cur_gp_seq, wait_gp_seq))
- wait_gp_seq = cur_gp_seq;
- needwait_gp = true;
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu,
- TPS("NeedWaitGP"));
- }
- if (rcu_segcblist_ready_cbs(&rdp->cblist)) {
- needwake = rdp->nocb_cb_sleep;
- WRITE_ONCE(rdp->nocb_cb_sleep, false);
- smp_mb(); /* CB invocation -after- GP end. */
- } else {
- needwake = false;
- }
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake) {
- swake_up_one(&rdp->nocb_cb_wq);
- gotcbs = true;
- }
- if (needwake_gp)
- rcu_gp_kthread_wake();
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
- }
-
- my_rdp->nocb_gp_bypass = bypass;
- my_rdp->nocb_gp_gp = needwait_gp;
- my_rdp->nocb_gp_seq = needwait_gp ? wait_gp_seq : 0;
-
- if (bypass && !rcu_nocb_poll) {
- // At least one child with non-empty ->nocb_bypass, so set
- // timer in order to avoid stranding its callbacks.
- wake_nocb_gp_defer(my_rdp, RCU_NOCB_WAKE_BYPASS,
- TPS("WakeBypassIsDeferred"));
- }
- if (rcu_nocb_poll) {
- /* Polling, so trace if first poll in the series. */
- if (gotcbs)
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Poll"));
- schedule_timeout_idle(1);
- } else if (!needwait_gp) {
- /* Wait for callbacks to appear. */
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("Sleep"));
- swait_event_interruptible_exclusive(my_rdp->nocb_gp_wq,
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_nocb_wake(rcu_state.name, cpu, TPS("EndSleep"));
- } else {
- rnp = my_rdp->mynode;
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("StartWait"));
- swait_event_interruptible_exclusive(
- rnp->nocb_gp_wq[rcu_seq_ctr(wait_gp_seq) & 0x1],
- rcu_seq_done(&rnp->gp_seq, wait_gp_seq) ||
- !READ_ONCE(my_rdp->nocb_gp_sleep));
- trace_rcu_this_gp(rnp, my_rdp, wait_gp_seq, TPS("EndWait"));
- }
- if (!rcu_nocb_poll) {
- raw_spin_lock_irqsave(&my_rdp->nocb_gp_lock, flags);
- if (my_rdp->nocb_defer_wakeup > RCU_NOCB_WAKE_NOT) {
- WRITE_ONCE(my_rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT);
- del_timer(&my_rdp->nocb_timer);
- }
- WRITE_ONCE(my_rdp->nocb_gp_sleep, true);
- raw_spin_unlock_irqrestore(&my_rdp->nocb_gp_lock, flags);
- }
- my_rdp->nocb_gp_seq = -1;
- WARN_ON(signal_pending(current));
-}
-
-/*
- * No-CBs grace-period-wait kthread. There is one of these per group
- * of CPUs, but only once at least one CPU in that group has come online
- * at least once since boot. This kthread checks for newly posted
- * callbacks from any of the CPUs it is responsible for, waits for a
- * grace period, then awakens all of the rcu_nocb_cb_kthread() instances
- * that then have callback-invocation work to do.
- */
-static int rcu_nocb_gp_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- for (;;) {
- WRITE_ONCE(rdp->nocb_gp_loops, rdp->nocb_gp_loops + 1);
- nocb_gp_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-static inline bool nocb_cb_can_run(struct rcu_data *rdp)
-{
- u8 flags = SEGCBLIST_OFFLOADED | SEGCBLIST_KTHREAD_CB;
- return rcu_segcblist_test_flags(&rdp->cblist, flags);
-}
-
-static inline bool nocb_cb_wait_cond(struct rcu_data *rdp)
-{
- return nocb_cb_can_run(rdp) && !READ_ONCE(rdp->nocb_cb_sleep);
-}
-
-/*
- * Invoke any ready callbacks from the corresponding no-CBs CPU,
- * then, if there are no more, wait for more to appear.
- */
-static void nocb_cb_wait(struct rcu_data *rdp)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long cur_gp_seq;
- unsigned long flags;
- bool needwake_state = false;
- bool needwake_gp = false;
- bool can_sleep = true;
- struct rcu_node *rnp = rdp->mynode;
-
- local_irq_save(flags);
- rcu_momentary_dyntick_idle();
- local_irq_restore(flags);
- /*
- * Disable BH to provide the expected environment. Also, when
- * transitioning to/from NOCB mode, a self-requeuing callback might
- * be invoked from softirq. A short grace period could cause both
- * instances of this callback would execute concurrently.
- */
- local_bh_disable();
- rcu_do_batch(rdp);
- local_bh_enable();
- lockdep_assert_irqs_enabled();
- rcu_nocb_lock_irqsave(rdp, flags);
- if (rcu_segcblist_nextgp(cblist, &cur_gp_seq) &&
- rcu_seq_done(&rnp->gp_seq, cur_gp_seq) &&
- raw_spin_trylock_rcu_node(rnp)) { /* irqs already disabled. */
- needwake_gp = rcu_advance_cbs(rdp->mynode, rdp);
- raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
- }
-
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_OFFLOADED)) {
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB)) {
- rcu_segcblist_set_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
- if (rcu_segcblist_ready_cbs(cblist))
- can_sleep = false;
- } else {
- /*
- * De-offloading. Clear our flag and notify the de-offload worker.
- * We won't touch the callbacks and keep sleeping until we ever
- * get re-offloaded.
- */
- WARN_ON_ONCE(!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB));
- rcu_segcblist_clear_flags(cblist, SEGCBLIST_KTHREAD_CB);
- if (!rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP))
- needwake_state = true;
- }
-
- WRITE_ONCE(rdp->nocb_cb_sleep, can_sleep);
-
- if (rdp->nocb_cb_sleep)
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("CBSleep"));
-
- rcu_nocb_unlock_irqrestore(rdp, flags);
- if (needwake_gp)
- rcu_gp_kthread_wake();
-
- if (needwake_state)
- swake_up_one(&rdp->nocb_state_wq);
-
- do {
- swait_event_interruptible_exclusive(rdp->nocb_cb_wq,
- nocb_cb_wait_cond(rdp));
-
- // VVV Ensure CB invocation follows _sleep test.
- if (smp_load_acquire(&rdp->nocb_cb_sleep)) { // ^^^
- WARN_ON(signal_pending(current));
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("WokeEmpty"));
- }
- } while (!nocb_cb_can_run(rdp));
-}
-
-/*
- * Per-rcu_data kthread, but only for no-CBs CPUs. Repeatedly invoke
- * nocb_cb_wait() to do the dirty work.
- */
-static int rcu_nocb_cb_kthread(void *arg)
-{
- struct rcu_data *rdp = arg;
-
- // Each pass through this loop does one callback batch, and,
- // if there are no more ready callbacks, waits for them.
- for (;;) {
- nocb_cb_wait(rdp);
- cond_resched_tasks_rcu_qs();
- }
- return 0;
-}
-
-/* Is a deferred wakeup of rcu_nocb_kthread() required? */
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
-{
- return READ_ONCE(rdp->nocb_defer_wakeup) >= level;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread(). */
-static bool do_nocb_deferred_wakeup_common(struct rcu_data *rdp_gp,
- struct rcu_data *rdp, int level,
- unsigned long flags)
- __releases(rdp_gp->nocb_gp_lock)
-{
- int ndw;
- int ret;
-
- if (!rcu_nocb_need_deferred_wakeup(rdp_gp, level)) {
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
- return false;
- }
-
- ndw = rdp_gp->nocb_defer_wakeup;
- ret = __wake_nocb_gp(rdp_gp, rdp, ndw == RCU_NOCB_WAKE_FORCE, flags);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake"));
-
- return ret;
-}
-
-/* Do a deferred wakeup of rcu_nocb_kthread() from a timer handler. */
-static void do_nocb_deferred_wakeup_timer(struct timer_list *t)
-{
- unsigned long flags;
- struct rcu_data *rdp = from_timer(rdp, t, nocb_timer);
-
- WARN_ON_ONCE(rdp->nocb_gp_rdp != rdp);
- trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("Timer"));
-
- raw_spin_lock_irqsave(&rdp->nocb_gp_lock, flags);
- smp_mb__after_spinlock(); /* Timer expire before wakeup. */
- do_nocb_deferred_wakeup_common(rdp, rdp, RCU_NOCB_WAKE_BYPASS, flags);
-}
-
-/*
- * Do a deferred wakeup of rcu_nocb_kthread() from fastpath.
- * This means we do an inexact common-case check. Note that if
- * we miss, ->nocb_timer will eventually clean things up.
- */
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- unsigned long flags;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
-
- if (!rdp_gp || !rcu_nocb_need_deferred_wakeup(rdp_gp, RCU_NOCB_WAKE))
- return false;
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- return do_nocb_deferred_wakeup_common(rdp_gp, rdp, RCU_NOCB_WAKE, flags);
-}
-
-void rcu_nocb_flush_deferred_wakeup(void)
-{
- do_nocb_deferred_wakeup(this_cpu_ptr(&rcu_data));
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_flush_deferred_wakeup);
-
-static int rdp_offload_toggle(struct rcu_data *rdp,
- bool offload, unsigned long flags)
- __releases(rdp->nocb_lock)
-{
- struct rcu_segcblist *cblist = &rdp->cblist;
- struct rcu_data *rdp_gp = rdp->nocb_gp_rdp;
- bool wake_gp = false;
-
- rcu_segcblist_offload(cblist, offload);
-
- if (rdp->nocb_cb_sleep)
- rdp->nocb_cb_sleep = false;
- rcu_nocb_unlock_irqrestore(rdp, flags);
-
- /*
- * Ignore former value of nocb_cb_sleep and force wake up as it could
- * have been spuriously set to false already.
- */
- swake_up_one(&rdp->nocb_cb_wq);
-
- raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags);
- if (rdp_gp->nocb_gp_sleep) {
- rdp_gp->nocb_gp_sleep = false;
- wake_gp = true;
- }
- raw_spin_unlock_irqrestore(&rdp_gp->nocb_gp_lock, flags);
-
- if (wake_gp)
- wake_up_process(rdp_gp->nocb_gp_kthread);
-
- return 0;
-}
-
-static long rcu_nocb_rdp_deoffload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
-
- pr_info("De-offloading %d\n", rdp->cpu);
-
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Flush once and for all now. This suffices because we are
- * running on the target CPU holding ->nocb_lock (thus having
- * interrupts disabled), and because rdp_offload_toggle()
- * invokes rcu_segcblist_offload(), which clears SEGCBLIST_OFFLOADED.
- * Thus future calls to rcu_segcblist_completely_offloaded() will
- * return false, which means that future calls to rcu_nocb_try_bypass()
- * will refuse to put anything into the bypass.
- */
- WARN_ON_ONCE(!rcu_nocb_flush_bypass(rdp, NULL, jiffies));
- ret = rdp_offload_toggle(rdp, false, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- !rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB |
- SEGCBLIST_KTHREAD_GP));
- /*
- * Lock one last time to acquire latest callback updates from kthreads
- * so we can later handle callbacks locally without locking.
- */
- rcu_nocb_lock_irqsave(rdp, flags);
- /*
- * Theoretically we could set SEGCBLIST_SOFTIRQ_ONLY after the nocb
- * lock is released but how about being paranoid for once?
- */
- rcu_segcblist_set_flags(cblist, SEGCBLIST_SOFTIRQ_ONLY);
- /*
- * With SEGCBLIST_SOFTIRQ_ONLY, we can't use
- * rcu_nocb_unlock_irqrestore() anymore.
- */
- raw_spin_unlock_irqrestore(&rdp->nocb_lock, flags);
-
- /* Sanity check */
- WARN_ON_ONCE(rcu_cblist_n_cbs(&rdp->nocb_bypass));
-
-
- return ret;
-}
-
-int rcu_nocb_cpu_deoffload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_deoffload, rdp);
- if (!ret)
- cpumask_clear_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-deoffload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_deoffload);
-
-static long rcu_nocb_rdp_offload(void *arg)
-{
- struct rcu_data *rdp = arg;
- struct rcu_segcblist *cblist = &rdp->cblist;
- unsigned long flags;
- int ret;
-
- WARN_ON_ONCE(rdp->cpu != raw_smp_processor_id());
- /*
- * For now we only support re-offload, ie: the rdp must have been
- * offloaded on boot first.
- */
- if (!rdp->nocb_gp_rdp)
- return -EINVAL;
-
- pr_info("Offloading %d\n", rdp->cpu);
- /*
- * Can't use rcu_nocb_lock_irqsave() while we are in
- * SEGCBLIST_SOFTIRQ_ONLY mode.
- */
- raw_spin_lock_irqsave(&rdp->nocb_lock, flags);
-
- /*
- * We didn't take the nocb lock while working on the
- * rdp->cblist in SEGCBLIST_SOFTIRQ_ONLY mode.
- * Every modifications that have been done previously on
- * rdp->cblist must be visible remotely by the nocb kthreads
- * upon wake up after reading the cblist flags.
- *
- * The layout against nocb_lock enforces that ordering:
- *
- * __rcu_nocb_rdp_offload() nocb_cb_wait()/nocb_gp_wait()
- * ------------------------- ----------------------------
- * WRITE callbacks rcu_nocb_lock()
- * rcu_nocb_lock() READ flags
- * WRITE flags READ callbacks
- * rcu_nocb_unlock() rcu_nocb_unlock()
- */
- ret = rdp_offload_toggle(rdp, true, flags);
- swait_event_exclusive(rdp->nocb_state_wq,
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_CB) &&
- rcu_segcblist_test_flags(cblist, SEGCBLIST_KTHREAD_GP));
-
- return ret;
-}
-
-int rcu_nocb_cpu_offload(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- int ret = 0;
-
- mutex_lock(&rcu_state.barrier_mutex);
- cpus_read_lock();
- if (!rcu_rdp_is_offloaded(rdp)) {
- if (cpu_online(cpu)) {
- ret = work_on_cpu(cpu, rcu_nocb_rdp_offload, rdp);
- if (!ret)
- cpumask_set_cpu(cpu, rcu_nocb_mask);
- } else {
- pr_info("NOCB: Can't CB-offload an offline CPU\n");
- ret = -EINVAL;
- }
- }
- cpus_read_unlock();
- mutex_unlock(&rcu_state.barrier_mutex);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(rcu_nocb_cpu_offload);
-
-void __init rcu_init_nohz(void)
-{
- int cpu;
- bool need_rcu_nocb_mask = false;
- struct rcu_data *rdp;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running && cpumask_weight(tick_nohz_full_mask))
- need_rcu_nocb_mask = true;
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_available(rcu_nocb_mask) && need_rcu_nocb_mask) {
- if (!zalloc_cpumask_var(&rcu_nocb_mask, GFP_KERNEL)) {
- pr_info("rcu_nocb_mask allocation failed, callback offloading disabled.\n");
- return;
- }
- }
- if (!cpumask_available(rcu_nocb_mask))
- return;
-
-#if defined(CONFIG_NO_HZ_FULL)
- if (tick_nohz_full_running)
- cpumask_or(rcu_nocb_mask, rcu_nocb_mask, tick_nohz_full_mask);
-#endif /* #if defined(CONFIG_NO_HZ_FULL) */
-
- if (!cpumask_subset(rcu_nocb_mask, cpu_possible_mask)) {
- pr_info("\tNote: kernel parameter 'rcu_nocbs=', 'nohz_full', or 'isolcpus=' contains nonexistent CPUs.\n");
- cpumask_and(rcu_nocb_mask, cpu_possible_mask,
- rcu_nocb_mask);
- }
- if (cpumask_empty(rcu_nocb_mask))
- pr_info("\tOffload RCU callbacks from CPUs: (none).\n");
- else
- pr_info("\tOffload RCU callbacks from CPUs: %*pbl.\n",
- cpumask_pr_args(rcu_nocb_mask));
- if (rcu_nocb_poll)
- pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
-
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rcu_segcblist_empty(&rdp->cblist))
- rcu_segcblist_init(&rdp->cblist);
- rcu_segcblist_offload(&rdp->cblist, true);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_CB);
- rcu_segcblist_set_flags(&rdp->cblist, SEGCBLIST_KTHREAD_GP);
- }
- rcu_organize_nocb_kthreads();
-}
-
-/* Initialize per-rcu_data variables for no-CBs CPUs. */
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
- init_swait_queue_head(&rdp->nocb_cb_wq);
- init_swait_queue_head(&rdp->nocb_gp_wq);
- init_swait_queue_head(&rdp->nocb_state_wq);
- raw_spin_lock_init(&rdp->nocb_lock);
- raw_spin_lock_init(&rdp->nocb_bypass_lock);
- raw_spin_lock_init(&rdp->nocb_gp_lock);
- timer_setup(&rdp->nocb_timer, do_nocb_deferred_wakeup_timer, 0);
- rcu_cblist_init(&rdp->nocb_bypass);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo CB kthread, spawn it. Additionally, if the rcuo GP kthread
- * for this CPU's group has not yet been created, spawn it as well.
- */
-static void rcu_spawn_one_nocb_kthread(int cpu)
-{
- struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
- struct rcu_data *rdp_gp;
- struct task_struct *t;
-
- /*
- * If this isn't a no-CBs CPU or if it already has an rcuo kthread,
- * then nothing to do.
- */
- if (!rcu_is_nocb_cpu(cpu) || rdp->nocb_cb_kthread)
- return;
-
- /* If we didn't spawn the GP kthread first, reorganize! */
- rdp_gp = rdp->nocb_gp_rdp;
- if (!rdp_gp->nocb_gp_kthread) {
- t = kthread_run(rcu_nocb_gp_kthread, rdp_gp,
- "rcuog/%d", rdp_gp->cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo GP kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp_gp->nocb_gp_kthread, t);
- }
-
- /* Spawn the kthread for this CPU. */
- t = kthread_run(rcu_nocb_cb_kthread, rdp,
- "rcuo%c/%d", rcu_state.abbr, cpu);
- if (WARN_ONCE(IS_ERR(t), "%s: Could not start rcuo CB kthread, OOM is now expected behavior\n", __func__))
- return;
- WRITE_ONCE(rdp->nocb_cb_kthread, t);
- WRITE_ONCE(rdp->nocb_gp_kthread, rdp_gp->nocb_gp_kthread);
-}
-
-/*
- * If the specified CPU is a no-CBs CPU that does not already have its
- * rcuo kthread, spawn it.
- */
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
- if (rcu_scheduler_fully_active)
- rcu_spawn_one_nocb_kthread(cpu);
-}
-
-/*
- * Once the scheduler is running, spawn rcuo kthreads for all online
- * no-CBs CPUs. This assumes that the early_initcall()s happen before
- * non-boot CPUs come online -- if this changes, we will need to add
- * some mutual exclusion.
- */
-static void __init rcu_spawn_nocb_kthreads(void)
-{
- int cpu;
-
- for_each_online_cpu(cpu)
- rcu_spawn_cpu_nocb_kthread(cpu);
-}
-
-/* How many CB CPU IDs per GP kthread? Default of -1 for sqrt(nr_cpu_ids). */
-static int rcu_nocb_gp_stride = -1;
-module_param(rcu_nocb_gp_stride, int, 0444);
-
-/*
- * Initialize GP-CB relationships for all no-CBs CPU.
- */
-static void __init rcu_organize_nocb_kthreads(void)
-{
- int cpu;
- bool firsttime = true;
- bool gotnocbs = false;
- bool gotnocbscbs = true;
- int ls = rcu_nocb_gp_stride;
- int nl = 0; /* Next GP kthread. */
- struct rcu_data *rdp;
- struct rcu_data *rdp_gp = NULL; /* Suppress misguided gcc warn. */
- struct rcu_data *rdp_prev = NULL;
-
- if (!cpumask_available(rcu_nocb_mask))
- return;
- if (ls == -1) {
- ls = nr_cpu_ids / int_sqrt(nr_cpu_ids);
- rcu_nocb_gp_stride = ls;
- }
-
- /*
- * Each pass through this loop sets up one rcu_data structure.
- * Should the corresponding CPU come online in the future, then
- * we will spawn the needed set of rcu_nocb_kthread() kthreads.
- */
- for_each_cpu(cpu, rcu_nocb_mask) {
- rdp = per_cpu_ptr(&rcu_data, cpu);
- if (rdp->cpu >= nl) {
- /* New GP kthread, set up for CBs & next GP. */
- gotnocbs = true;
- nl = DIV_ROUND_UP(rdp->cpu + 1, ls) * ls;
- rdp->nocb_gp_rdp = rdp;
- rdp_gp = rdp;
- if (dump_tree) {
- if (!firsttime)
- pr_cont("%s\n", gotnocbscbs
- ? "" : " (self only)");
- gotnocbscbs = false;
- firsttime = false;
- pr_alert("%s: No-CB GP kthread CPU %d:",
- __func__, cpu);
- }
- } else {
- /* Another CB kthread, link to previous GP kthread. */
- gotnocbscbs = true;
- rdp->nocb_gp_rdp = rdp_gp;
- rdp_prev->nocb_next_cb_rdp = rdp;
- if (dump_tree)
- pr_cont(" %d", cpu);
- }
- rdp_prev = rdp;
- }
- if (gotnocbs && dump_tree)
- pr_cont("%s\n", gotnocbscbs ? "" : " (self only)");
-}
-
-/*
- * Bind the current task to the offloaded CPUs. If there are no offloaded
- * CPUs, leave the task unbound. Splat if the bind attempt fails.
- */
-void rcu_bind_current_to_nocb(void)
-{
- if (cpumask_available(rcu_nocb_mask) && cpumask_weight(rcu_nocb_mask))
- WARN_ON(sched_setaffinity(current->pid, rcu_nocb_mask));
-}
-EXPORT_SYMBOL_GPL(rcu_bind_current_to_nocb);
-
-// The ->on_cpu field is available only in CONFIG_SMP=y, so...
-#ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return tsp && task_is_running(tsp) && !tsp->on_cpu ? "!" : "";
-}
-#else // #ifdef CONFIG_SMP
-static char *show_rcu_should_be_on_cpu(struct task_struct *tsp)
-{
- return "";
-}
-#endif // #else #ifdef CONFIG_SMP
-
-/*
- * Dump out nocb grace-period kthread state for the specified rcu_data
- * structure.
- */
-static void show_rcu_nocb_gp_state(struct rcu_data *rdp)
-{
- struct rcu_node *rnp = rdp->mynode;
-
- pr_info("nocb GP %d %c%c%c%c%c %c[%c%c] %c%c:%ld rnp %d:%d %lu %c CPU %d%s\n",
- rdp->cpu,
- "kK"[!!rdp->nocb_gp_kthread],
- "lL"[raw_spin_is_locked(&rdp->nocb_gp_lock)],
- "dD"[!!rdp->nocb_defer_wakeup],
- "tT"[timer_pending(&rdp->nocb_timer)],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[swait_active(&rdp->nocb_gp_wq)],
- ".W"[swait_active(&rnp->nocb_gp_wq[0])],
- ".W"[swait_active(&rnp->nocb_gp_wq[1])],
- ".B"[!!rdp->nocb_gp_bypass],
- ".G"[!!rdp->nocb_gp_gp],
- (long)rdp->nocb_gp_seq,
- rnp->grplo, rnp->grphi, READ_ONCE(rdp->nocb_gp_loops),
- rdp->nocb_gp_kthread ? task_state_to_char(rdp->nocb_gp_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-}
-
-/* Dump out nocb kthread state for the specified rcu_data structure. */
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
- char bufw[20];
- char bufr[20];
- struct rcu_segcblist *rsclp = &rdp->cblist;
- bool waslocked;
- bool wassleep;
-
- if (rdp->nocb_gp_rdp == rdp)
- show_rcu_nocb_gp_state(rdp);
-
- sprintf(bufw, "%ld", rsclp->gp_seq[RCU_WAIT_TAIL]);
- sprintf(bufr, "%ld", rsclp->gp_seq[RCU_NEXT_READY_TAIL]);
- pr_info(" CB %d^%d->%d %c%c%c%c%c%c F%ld L%ld C%d %c%c%s%c%s%c%c q%ld %c CPU %d%s\n",
- rdp->cpu, rdp->nocb_gp_rdp->cpu,
- rdp->nocb_next_cb_rdp ? rdp->nocb_next_cb_rdp->cpu : -1,
- "kK"[!!rdp->nocb_cb_kthread],
- "bB"[raw_spin_is_locked(&rdp->nocb_bypass_lock)],
- "cC"[!!atomic_read(&rdp->nocb_lock_contended)],
- "lL"[raw_spin_is_locked(&rdp->nocb_lock)],
- "sS"[!!rdp->nocb_cb_sleep],
- ".W"[swait_active(&rdp->nocb_cb_wq)],
- jiffies - rdp->nocb_bypass_first,
- jiffies - rdp->nocb_nobypass_last,
- rdp->nocb_nobypass_count,
- ".D"[rcu_segcblist_ready_cbs(rsclp)],
- ".W"[!rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_WAIT_TAIL) ? "" : bufw,
- ".R"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL)],
- rcu_segcblist_segempty(rsclp, RCU_NEXT_READY_TAIL) ? "" : bufr,
- ".N"[!rcu_segcblist_segempty(rsclp, RCU_NEXT_TAIL)],
- ".B"[!!rcu_cblist_n_cbs(&rdp->nocb_bypass)],
- rcu_segcblist_n_cbs(&rdp->cblist),
- rdp->nocb_cb_kthread ? task_state_to_char(rdp->nocb_cb_kthread) : '.',
- rdp->nocb_cb_kthread ? (int)task_cpu(rdp->nocb_gp_kthread) : -1,
- show_rcu_should_be_on_cpu(rdp->nocb_cb_kthread));
-
- /* It is OK for GP kthreads to have GP state. */
- if (rdp->nocb_gp_rdp == rdp)
- return;
-
- waslocked = raw_spin_is_locked(&rdp->nocb_gp_lock);
- wassleep = swait_active(&rdp->nocb_gp_wq);
- if (!rdp->nocb_gp_sleep && !waslocked && !wassleep)
- return; /* Nothing untoward. */
-
- pr_info(" nocb GP activity on CB-only CPU!!! %c%c%c %c\n",
- "lL"[waslocked],
- "dD"[!!rdp->nocb_defer_wakeup],
- "sS"[!!rdp->nocb_gp_sleep],
- ".W"[wassleep]);
-}
-
-#else /* #ifdef CONFIG_RCU_NOCB_CPU */
-
-/* No ->nocb_lock to acquire. */
-static void rcu_nocb_lock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock(struct rcu_data *rdp)
-{
-}
-
-/* No ->nocb_lock to release. */
-static void rcu_nocb_unlock_irqrestore(struct rcu_data *rdp,
- unsigned long flags)
-{
- local_irq_restore(flags);
-}
-
-/* Lockdep check that ->cblist may be safely accessed. */
-static void rcu_lockdep_assert_cblist_protected(struct rcu_data *rdp)
-{
- lockdep_assert_irqs_disabled();
-}
-
-static void rcu_nocb_gp_cleanup(struct swait_queue_head *sq)
-{
-}
-
-static struct swait_queue_head *rcu_nocb_gp_get(struct rcu_node *rnp)
-{
- return NULL;
-}
-
-static void rcu_init_one_nocb(struct rcu_node *rnp)
-{
-}
-
-static bool rcu_nocb_flush_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- unsigned long j)
-{
- return true;
-}
-
-static bool rcu_nocb_try_bypass(struct rcu_data *rdp, struct rcu_head *rhp,
- bool *was_alldone, unsigned long flags)
-{
- return false;
-}
-
-static void __call_rcu_nocb_wake(struct rcu_data *rdp, bool was_empty,
- unsigned long flags)
-{
- WARN_ON_ONCE(1); /* Should be dead code! */
-}
-
-static void __init rcu_boot_init_nocb_percpu_data(struct rcu_data *rdp)
-{
-}
-
-static int rcu_nocb_need_deferred_wakeup(struct rcu_data *rdp, int level)
-{
- return false;
-}
-
-static bool do_nocb_deferred_wakeup(struct rcu_data *rdp)
-{
- return false;
-}
-
-static void rcu_spawn_cpu_nocb_kthread(int cpu)
-{
-}
-
-static void __init rcu_spawn_nocb_kthreads(void)
-{
-}
-
-static void show_rcu_nocb_state(struct rcu_data *rdp)
-{
-}
-
-#endif /* #else #ifdef CONFIG_RCU_NOCB_CPU */
-
/*
* Is this CPU a NO_HZ_FULL CPU that should ignore RCU so that the
* grace-period kthread will do force_quiescent_state() processing?
@@ -2982,17 +1498,17 @@ static void noinstr rcu_dynticks_task_exit(void)
/* Turn on heavyweight RCU tasks trace readers on idle/user entry. */
static void rcu_dynticks_task_trace_enter(void)
{
-#ifdef CONFIG_TASKS_RCU_TRACE
+#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
current->trc_reader_special.b.need_mb = true;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
/* Turn off heavyweight RCU tasks trace readers on idle/user exit. */
static void rcu_dynticks_task_trace_exit(void)
{
-#ifdef CONFIG_TASKS_RCU_TRACE
+#ifdef CONFIG_TASKS_TRACE_RCU
if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB))
current->trc_reader_special.b.need_mb = false;
-#endif /* #ifdef CONFIG_TASKS_RCU_TRACE */
+#endif /* #ifdef CONFIG_TASKS_TRACE_RCU */
}
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 6c76988cc019..677ee3d8671b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -7,6 +7,8 @@
* Author: Paul E. McKenney <paulmck@linux.ibm.com>
*/
+#include <linux/kvm_para.h>
+
//////////////////////////////////////////////////////////////////////////////
//
// Controlling CPU stall warnings, including delay calculation.
@@ -117,17 +119,14 @@ static void panic_on_rcu_stall(void)
}
/**
- * rcu_cpu_stall_reset - prevent further stall warnings in current grace period
- *
- * Set the stall-warning timeout way off into the future, thus preventing
- * any RCU CPU stall-warning messages from appearing in the current set of
- * RCU grace periods.
+ * rcu_cpu_stall_reset - restart stall-warning timeout for current grace period
*
* The caller must disable hard irqs.
*/
void rcu_cpu_stall_reset(void)
{
- WRITE_ONCE(rcu_state.jiffies_stall, jiffies + ULONG_MAX / 2);
+ WRITE_ONCE(rcu_state.jiffies_stall,
+ jiffies + rcu_jiffies_till_stall_check());
}
//////////////////////////////////////////////////////////////////////////////
@@ -267,8 +266,10 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
struct task_struct *ts[8];
lockdep_assert_irqs_disabled();
- if (!rcu_preempt_blocked_readers_cgp(rnp))
+ if (!rcu_preempt_blocked_readers_cgp(rnp)) {
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
return 0;
+ }
pr_err("\tTasks blocked on level-%d rcu_node (CPUs %d-%d):",
rnp->level, rnp->grplo, rnp->grphi);
t = list_entry(rnp->gp_tasks->prev,
@@ -280,8 +281,8 @@ static int rcu_print_task_stall(struct rcu_node *rnp, unsigned long flags)
break;
}
raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
- for (i--; i; i--) {
- t = ts[i];
+ while (i) {
+ t = ts[--i];
if (!try_invoke_on_locked_down_task(t, check_slow_task, &rscr))
pr_cont(" P%d", t->pid);
else
@@ -350,7 +351,7 @@ static void rcu_dump_cpu_stacks(void)
static void print_cpu_stall_fast_no_hz(char *cp, int cpu)
{
- struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+ struct rcu_data *rdp = per_cpu_ptr(&rcu_data, cpu);
sprintf(cp, "last_accelerate: %04lx/%04lx dyntick_enabled: %d",
rdp->last_accelerate & 0xffff, jiffies & 0xffff,
@@ -464,9 +465,10 @@ static void rcu_check_gp_kthread_starvation(void)
pr_err("%s kthread starved for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x ->cpu=%d\n",
rcu_state.name, j,
(long)rcu_seq_current(&rcu_state.gp_seq),
- data_race(rcu_state.gp_flags),
- gp_state_getname(rcu_state.gp_state), rcu_state.gp_state,
- gpk ? gpk->__state : ~0, cpu);
+ data_race(READ_ONCE(rcu_state.gp_flags)),
+ gp_state_getname(rcu_state.gp_state),
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ gpk ? data_race(READ_ONCE(gpk->__state)) : ~0, cpu);
if (gpk) {
pr_err("\tUnless %s kthread gets sufficient CPU time, OOM is now expected behavior.\n", rcu_state.name);
pr_err("RCU grace-period kthread stack dump:\n");
@@ -509,7 +511,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
(long)rcu_seq_current(&rcu_state.gp_seq),
data_race(rcu_state.gp_flags),
gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
- gpk->__state);
+ data_race(READ_ONCE(gpk->__state)));
pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
cpu, kstat_softirqs_cpu(TIMER_SOFTIRQ, cpu));
}
@@ -568,11 +570,11 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
pr_err("INFO: Stall ended before state dump start\n");
} else {
j = jiffies;
- gpa = data_race(rcu_state.gp_activity);
+ gpa = data_race(READ_ONCE(rcu_state.gp_activity));
pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
rcu_state.name, j - gpa, j, gpa,
- data_race(jiffies_till_next_fqs),
- rcu_get_root()->qsmask);
+ data_race(READ_ONCE(jiffies_till_next_fqs)),
+ data_race(READ_ONCE(rcu_get_root()->qsmask)));
}
}
/* Rewrite if needed in case of slow consoles. */
@@ -646,6 +648,7 @@ static void print_cpu_stall(unsigned long gps)
static void check_cpu_stall(struct rcu_data *rdp)
{
+ bool didstall = false;
unsigned long gs1;
unsigned long gs2;
unsigned long gps;
@@ -691,24 +694,46 @@ static void check_cpu_stall(struct rcu_data *rdp)
ULONG_CMP_GE(gps, js))
return; /* No stall or GP completed since entering function. */
rnp = rdp->mynode;
- jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ jn = jiffies + ULONG_MAX / 2;
if (rcu_gp_in_progress() &&
(READ_ONCE(rnp->qsmask) & rdp->grpmask) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* We haven't checked in, so go dump stack. */
print_cpu_stall(gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
} else if (rcu_gp_in_progress() &&
ULONG_CMP_GE(j, js + RCU_STALL_RAT_DELAY) &&
cmpxchg(&rcu_state.jiffies_stall, js, jn) == js) {
+ /*
+ * If a virtual machine is stopped by the host it can look to
+ * the watchdog like an RCU stall. Check to see if the host
+ * stopped the vm.
+ */
+ if (kvm_check_and_clear_guest_paused())
+ return;
+
/* They had a few time units to dump stack, so complain. */
print_other_cpu_stall(gs2, gps);
if (READ_ONCE(rcu_cpu_stall_ftrace_dump))
rcu_ftrace_dump(DUMP_ALL);
+ didstall = true;
+ }
+ if (didstall && READ_ONCE(rcu_state.jiffies_stall) == jn) {
+ jn = jiffies + 3 * rcu_jiffies_till_stall_check() + 3;
+ WRITE_ONCE(rcu_state.jiffies_stall, jn);
}
}
@@ -742,7 +767,7 @@ bool rcu_check_boost_fail(unsigned long gp_state, int *cpup)
rcu_for_each_leaf_node(rnp) {
if (!cpup) {
- if (READ_ONCE(rnp->qsmask)) {
+ if (data_race(READ_ONCE(rnp->qsmask))) {
return false;
} else {
if (READ_ONCE(rnp->gp_tasks))
@@ -791,32 +816,34 @@ void show_rcu_gp_kthreads(void)
struct task_struct *t = READ_ONCE(rcu_state.gp_kthread);
j = jiffies;
- ja = j - data_race(rcu_state.gp_activity);
- jr = j - data_race(rcu_state.gp_req_activity);
- js = j - data_race(rcu_state.gp_start);
- jw = j - data_race(rcu_state.gp_wake_time);
+ ja = j - data_race(READ_ONCE(rcu_state.gp_activity));
+ jr = j - data_race(READ_ONCE(rcu_state.gp_req_activity));
+ js = j - data_race(READ_ONCE(rcu_state.gp_start));
+ jw = j - data_race(READ_ONCE(rcu_state.gp_wake_time));
pr_info("%s: wait state: %s(%d) ->state: %#x ->rt_priority %u delta ->gp_start %lu ->gp_activity %lu ->gp_req_activity %lu ->gp_wake_time %lu ->gp_wake_seq %ld ->gp_seq %ld ->gp_seq_needed %ld ->gp_max %lu ->gp_flags %#x\n",
rcu_state.name, gp_state_getname(rcu_state.gp_state),
- rcu_state.gp_state, t ? t->__state : 0x1ffff, t ? t->rt_priority : 0xffU,
- js, ja, jr, jw, (long)data_race(rcu_state.gp_wake_seq),
- (long)data_race(rcu_state.gp_seq),
- (long)data_race(rcu_get_root()->gp_seq_needed),
- data_race(rcu_state.gp_max),
- data_race(rcu_state.gp_flags));
+ data_race(READ_ONCE(rcu_state.gp_state)),
+ t ? data_race(READ_ONCE(t->__state)) : 0x1ffff, t ? t->rt_priority : 0xffU,
+ js, ja, jr, jw, (long)data_race(READ_ONCE(rcu_state.gp_wake_seq)),
+ (long)data_race(READ_ONCE(rcu_state.gp_seq)),
+ (long)data_race(READ_ONCE(rcu_get_root()->gp_seq_needed)),
+ data_race(READ_ONCE(rcu_state.gp_max)),
+ data_race(READ_ONCE(rcu_state.gp_flags)));
rcu_for_each_node_breadth_first(rnp) {
if (ULONG_CMP_GE(READ_ONCE(rcu_state.gp_seq), READ_ONCE(rnp->gp_seq_needed)) &&
- !data_race(rnp->qsmask) && !data_race(rnp->boost_tasks) &&
- !data_race(rnp->exp_tasks) && !data_race(rnp->gp_tasks))
+ !data_race(READ_ONCE(rnp->qsmask)) && !data_race(READ_ONCE(rnp->boost_tasks)) &&
+ !data_race(READ_ONCE(rnp->exp_tasks)) && !data_race(READ_ONCE(rnp->gp_tasks)))
continue;
pr_info("\trcu_node %d:%d ->gp_seq %ld ->gp_seq_needed %ld ->qsmask %#lx %c%c%c%c ->n_boosts %ld\n",
rnp->grplo, rnp->grphi,
- (long)data_race(rnp->gp_seq), (long)data_race(rnp->gp_seq_needed),
- data_race(rnp->qsmask),
- ".b"[!!data_race(rnp->boost_kthread_task)],
- ".B"[!!data_race(rnp->boost_tasks)],
- ".E"[!!data_race(rnp->exp_tasks)],
- ".G"[!!data_race(rnp->gp_tasks)],
- data_race(rnp->n_boosts));
+ (long)data_race(READ_ONCE(rnp->gp_seq)),
+ (long)data_race(READ_ONCE(rnp->gp_seq_needed)),
+ data_race(READ_ONCE(rnp->qsmask)),
+ ".b"[!!data_race(READ_ONCE(rnp->boost_kthread_task))],
+ ".B"[!!data_race(READ_ONCE(rnp->boost_tasks))],
+ ".E"[!!data_race(READ_ONCE(rnp->exp_tasks))],
+ ".G"[!!data_race(READ_ONCE(rnp->gp_tasks))],
+ data_race(READ_ONCE(rnp->n_boosts)));
if (!rcu_is_leaf_node(rnp))
continue;
for_each_leaf_node_possible_cpu(rnp, cpu) {
@@ -826,12 +853,12 @@ void show_rcu_gp_kthreads(void)
READ_ONCE(rdp->gp_seq_needed)))
continue;
pr_info("\tcpu %d ->gp_seq_needed %ld\n",
- cpu, (long)data_race(rdp->gp_seq_needed));
+ cpu, (long)data_race(READ_ONCE(rdp->gp_seq_needed)));
}
}
for_each_possible_cpu(cpu) {
rdp = per_cpu_ptr(&rcu_data, cpu);
- cbs += data_race(rdp->n_cbs_invoked);
+ cbs += data_race(READ_ONCE(rdp->n_cbs_invoked));
if (rcu_segcblist_is_offloaded(&rdp->cblist))
show_rcu_nocb_state(rdp);
}
@@ -913,11 +940,11 @@ void rcu_fwd_progress_check(unsigned long j)
if (rcu_gp_in_progress()) {
pr_info("%s: GP age %lu jiffies\n",
- __func__, jiffies - rcu_state.gp_start);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_start)));
show_rcu_gp_kthreads();
} else {
pr_info("%s: Last GP end %lu jiffies ago\n",
- __func__, jiffies - rcu_state.gp_end);
+ __func__, jiffies - data_race(READ_ONCE(rcu_state.gp_end)));
preempt_disable();
rdp = this_cpu_ptr(&rcu_data);
rcu_check_gp_start_stall(rdp->mynode, rdp, j);
diff --git a/kernel/scftorture.c b/kernel/scftorture.c
index 29e8fc5d91a7..64a08288b1a6 100644
--- a/kernel/scftorture.c
+++ b/kernel/scftorture.c
@@ -64,6 +64,7 @@ torture_param(bool, use_cpus_read_lock, 0, "Use cpus_read_lock() to exclude CPU
torture_param(int, verbose, 0, "Enable verbose debugging printk()s");
torture_param(int, weight_resched, -1, "Testing weight for resched_cpu() operations.");
torture_param(int, weight_single, -1, "Testing weight for single-CPU no-wait operations.");
+torture_param(int, weight_single_rpc, -1, "Testing weight for single-CPU RPC operations.");
torture_param(int, weight_single_wait, -1, "Testing weight for single-CPU operations.");
torture_param(int, weight_many, -1, "Testing weight for multi-CPU no-wait operations.");
torture_param(int, weight_many_wait, -1, "Testing weight for multi-CPU operations.");
@@ -86,6 +87,8 @@ struct scf_statistics {
long long n_resched;
long long n_single;
long long n_single_ofl;
+ long long n_single_rpc;
+ long long n_single_rpc_ofl;
long long n_single_wait;
long long n_single_wait_ofl;
long long n_many;
@@ -101,14 +104,17 @@ static DEFINE_PER_CPU(long long, scf_invoked_count);
// Data for random primitive selection
#define SCF_PRIM_RESCHED 0
#define SCF_PRIM_SINGLE 1
-#define SCF_PRIM_MANY 2
-#define SCF_PRIM_ALL 3
-#define SCF_NPRIMS 7 // Need wait and no-wait versions of each,
- // except for SCF_PRIM_RESCHED.
+#define SCF_PRIM_SINGLE_RPC 2
+#define SCF_PRIM_MANY 3
+#define SCF_PRIM_ALL 4
+#define SCF_NPRIMS 8 // Need wait and no-wait versions of each,
+ // except for SCF_PRIM_RESCHED and
+ // SCF_PRIM_SINGLE_RPC.
static char *scf_prim_name[] = {
"resched_cpu",
"smp_call_function_single",
+ "smp_call_function_single_rpc",
"smp_call_function_many",
"smp_call_function",
};
@@ -128,6 +134,8 @@ struct scf_check {
bool scfc_out;
int scfc_cpu; // -1 for not _single().
bool scfc_wait;
+ bool scfc_rpc;
+ struct completion scfc_completion;
};
// Use to wait for all threads to start.
@@ -158,6 +166,7 @@ static void scf_torture_stats_print(void)
scfs.n_resched += scf_stats_p[i].n_resched;
scfs.n_single += scf_stats_p[i].n_single;
scfs.n_single_ofl += scf_stats_p[i].n_single_ofl;
+ scfs.n_single_rpc += scf_stats_p[i].n_single_rpc;
scfs.n_single_wait += scf_stats_p[i].n_single_wait;
scfs.n_single_wait_ofl += scf_stats_p[i].n_single_wait_ofl;
scfs.n_many += scf_stats_p[i].n_many;
@@ -168,9 +177,10 @@ static void scf_torture_stats_print(void)
if (atomic_read(&n_errs) || atomic_read(&n_mb_in_errs) ||
atomic_read(&n_mb_out_errs) || atomic_read(&n_alloc_errs))
bangstr = "!!! ";
- pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld many: %lld/%lld all: %lld/%lld ",
+ pr_alert("%s %sscf_invoked_count %s: %lld resched: %lld single: %lld/%lld single_ofl: %lld/%lld single_rpc: %lld single_rpc_ofl: %lld many: %lld/%lld all: %lld/%lld ",
SCFTORT_FLAG, bangstr, isdone ? "VER" : "ver", invoked_count, scfs.n_resched,
scfs.n_single, scfs.n_single_wait, scfs.n_single_ofl, scfs.n_single_wait_ofl,
+ scfs.n_single_rpc, scfs.n_single_rpc_ofl,
scfs.n_many, scfs.n_many_wait, scfs.n_all, scfs.n_all_wait);
torture_onoff_stats();
pr_cont("ste: %d stnmie: %d stnmoe: %d staf: %d\n", atomic_read(&n_errs),
@@ -282,10 +292,13 @@ static void scf_handler(void *scfc_in)
out:
if (unlikely(!scfcp))
return;
- if (scfcp->scfc_wait)
+ if (scfcp->scfc_wait) {
WRITE_ONCE(scfcp->scfc_out, true);
- else
+ if (scfcp->scfc_rpc)
+ complete(&scfcp->scfc_completion);
+ } else {
kfree(scfcp);
+ }
}
// As above, but check for correct CPU.
@@ -319,6 +332,7 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp->scfc_cpu = -1;
scfcp->scfc_wait = scfsp->scfs_wait;
scfcp->scfc_out = false;
+ scfcp->scfc_rpc = false;
}
}
switch (scfsp->scfs_prim) {
@@ -350,6 +364,34 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
scfcp = NULL;
}
break;
+ case SCF_PRIM_SINGLE_RPC:
+ if (!scfcp)
+ break;
+ cpu = torture_random(trsp) % nr_cpu_ids;
+ scfp->n_single_rpc++;
+ scfcp->scfc_cpu = cpu;
+ scfcp->scfc_wait = true;
+ init_completion(&scfcp->scfc_completion);
+ scfcp->scfc_rpc = true;
+ barrier(); // Prevent race-reduction compiler optimizations.
+ scfcp->scfc_in = true;
+ ret = smp_call_function_single(cpu, scf_handler_1, (void *)scfcp, 0);
+ if (!ret) {
+ if (use_cpus_read_lock)
+ cpus_read_unlock();
+ else
+ preempt_enable();
+ wait_for_completion(&scfcp->scfc_completion);
+ if (use_cpus_read_lock)
+ cpus_read_lock();
+ else
+ preempt_disable();
+ } else {
+ scfp->n_single_rpc_ofl++;
+ kfree(scfcp);
+ scfcp = NULL;
+ }
+ break;
case SCF_PRIM_MANY:
if (scfsp->scfs_wait)
scfp->n_many_wait++;
@@ -379,10 +421,12 @@ static void scftorture_invoke_one(struct scf_statistics *scfp, struct torture_ra
}
if (scfcp && scfsp->scfs_wait) {
if (WARN_ON_ONCE((num_online_cpus() > 1 || scfsp->scfs_prim == SCF_PRIM_SINGLE) &&
- !scfcp->scfc_out))
+ !scfcp->scfc_out)) {
+ pr_warn("%s: Memory-ordering failure, scfs_prim: %d.\n", __func__, scfsp->scfs_prim);
atomic_inc(&n_mb_out_errs); // Leak rather than trash!
- else
+ } else {
kfree(scfcp);
+ }
barrier(); // Prevent race-reduction compiler optimizations.
}
if (use_cpus_read_lock)
@@ -453,8 +497,8 @@ static void
scftorture_print_module_parms(const char *tag)
{
pr_alert(SCFTORT_FLAG
- "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
- verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
+ "--- %s: verbose=%d holdoff=%d longwait=%d nthreads=%d onoff_holdoff=%d onoff_interval=%d shutdown_secs=%d stat_interval=%d stutter=%d use_cpus_read_lock=%d, weight_resched=%d, weight_single=%d, weight_single_rpc=%d, weight_single_wait=%d, weight_many=%d, weight_many_wait=%d, weight_all=%d, weight_all_wait=%d\n", tag,
+ verbose, holdoff, longwait, nthreads, onoff_holdoff, onoff_interval, shutdown, stat_interval, stutter, use_cpus_read_lock, weight_resched, weight_single, weight_single_rpc, weight_single_wait, weight_many, weight_many_wait, weight_all, weight_all_wait);
}
static void scf_cleanup_handler(void *unused)
@@ -469,7 +513,7 @@ static void scf_torture_cleanup(void)
return;
WRITE_ONCE(scfdone, true);
- if (nthreads)
+ if (nthreads && scf_stats_p)
for (i = 0; i < nthreads; i++)
torture_stop_kthread("scftorture_invoker", scf_stats_p[i].task);
else
@@ -497,6 +541,7 @@ static int __init scf_torture_init(void)
int firsterr = 0;
unsigned long weight_resched1 = weight_resched;
unsigned long weight_single1 = weight_single;
+ unsigned long weight_single_rpc1 = weight_single_rpc;
unsigned long weight_single_wait1 = weight_single_wait;
unsigned long weight_many1 = weight_many;
unsigned long weight_many_wait1 = weight_many_wait;
@@ -508,11 +553,13 @@ static int __init scf_torture_init(void)
scftorture_print_module_parms("Start of test");
- if (weight_resched == -1 && weight_single == -1 && weight_single_wait == -1 &&
+ if (weight_resched == -1 &&
+ weight_single == -1 && weight_single_rpc == -1 && weight_single_wait == -1 &&
weight_many == -1 && weight_many_wait == -1 &&
weight_all == -1 && weight_all_wait == -1) {
weight_resched1 = 2 * nr_cpu_ids;
weight_single1 = 2 * nr_cpu_ids;
+ weight_single_rpc1 = 2 * nr_cpu_ids;
weight_single_wait1 = 2 * nr_cpu_ids;
weight_many1 = 2;
weight_many_wait1 = 2;
@@ -523,6 +570,8 @@ static int __init scf_torture_init(void)
weight_resched1 = 0;
if (weight_single == -1)
weight_single1 = 0;
+ if (weight_single_rpc == -1)
+ weight_single_rpc1 = 0;
if (weight_single_wait == -1)
weight_single_wait1 = 0;
if (weight_many == -1)
@@ -534,7 +583,7 @@ static int __init scf_torture_init(void)
if (weight_all_wait == -1)
weight_all_wait1 = 0;
}
- if (weight_single1 == 0 && weight_single_wait1 == 0 &&
+ if (weight_single1 == 0 && weight_single_rpc1 == 0 && weight_single_wait1 == 0 &&
weight_many1 == 0 && weight_many_wait1 == 0 &&
weight_all1 == 0 && weight_all_wait1 == 0) {
VERBOSE_SCFTORTOUT_ERRSTRING("all zero weights makes no sense");
@@ -546,6 +595,7 @@ static int __init scf_torture_init(void)
else if (weight_resched1)
VERBOSE_SCFTORTOUT_ERRSTRING("built as module, weight_resched ignored");
scf_sel_add(weight_single1, SCF_PRIM_SINGLE, false);
+ scf_sel_add(weight_single_rpc1, SCF_PRIM_SINGLE_RPC, true);
scf_sel_add(weight_single_wait1, SCF_PRIM_SINGLE, true);
scf_sel_add(weight_many1, SCF_PRIM_MANY, false);
scf_sel_add(weight_many_wait1, SCF_PRIM_MANY, true);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c89c1d45dd0b..c4462c454ab9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -237,9 +237,30 @@ static DEFINE_MUTEX(sched_core_mutex);
static atomic_t sched_core_count;
static struct cpumask sched_core_mask;
+static void sched_core_lock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t, i = 0;
+
+ local_irq_save(*flags);
+ for_each_cpu(t, smt_mask)
+ raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
+}
+
+static void sched_core_unlock(int cpu, unsigned long *flags)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ int t;
+
+ for_each_cpu(t, smt_mask)
+ raw_spin_unlock(&cpu_rq(t)->__lock);
+ local_irq_restore(*flags);
+}
+
static void __sched_core_flip(bool enabled)
{
- int cpu, t, i;
+ unsigned long flags;
+ int cpu, t;
cpus_read_lock();
@@ -250,19 +271,12 @@ static void __sched_core_flip(bool enabled)
for_each_cpu(cpu, &sched_core_mask) {
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- i = 0;
- local_irq_disable();
- for_each_cpu(t, smt_mask) {
- /* supports up to SMT8 */
- raw_spin_lock_nested(&cpu_rq(t)->__lock, i++);
- }
+ sched_core_lock(cpu, &flags);
for_each_cpu(t, smt_mask)
cpu_rq(t)->core_enabled = enabled;
- for_each_cpu(t, smt_mask)
- raw_spin_unlock(&cpu_rq(t)->__lock);
- local_irq_enable();
+ sched_core_unlock(cpu, &flags);
cpumask_andnot(&sched_core_mask, &sched_core_mask, smt_mask);
}
@@ -993,6 +1007,7 @@ int get_nohz_timer_target(void)
{
int i, cpu = smp_processor_id(), default_cpu = -1;
struct sched_domain *sd;
+ const struct cpumask *hk_mask;
if (housekeeping_cpu(cpu, HK_FLAG_TIMER)) {
if (!idle_cpu(cpu))
@@ -1000,10 +1015,11 @@ int get_nohz_timer_target(void)
default_cpu = cpu;
}
+ hk_mask = housekeeping_cpumask(HK_FLAG_TIMER);
+
rcu_read_lock();
for_each_domain(cpu, sd) {
- for_each_cpu_and(i, sched_domain_span(sd),
- housekeeping_cpumask(HK_FLAG_TIMER)) {
+ for_each_cpu_and(i, sched_domain_span(sd), hk_mask) {
if (cpu == i)
continue;
@@ -1619,6 +1635,23 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p)
uclamp_rq_dec_id(rq, p, clamp_id);
}
+static inline void uclamp_rq_reinc_id(struct rq *rq, struct task_struct *p,
+ enum uclamp_id clamp_id)
+{
+ if (!p->uclamp[clamp_id].active)
+ return;
+
+ uclamp_rq_dec_id(rq, p, clamp_id);
+ uclamp_rq_inc_id(rq, p, clamp_id);
+
+ /*
+ * Make sure to clear the idle flag if we've transiently reached 0
+ * active tasks on rq.
+ */
+ if (clamp_id == UCLAMP_MAX && (rq->uclamp_flags & UCLAMP_FLAG_IDLE))
+ rq->uclamp_flags &= ~UCLAMP_FLAG_IDLE;
+}
+
static inline void
uclamp_update_active(struct task_struct *p)
{
@@ -1642,12 +1675,8 @@ uclamp_update_active(struct task_struct *p)
* affecting a valid clamp bucket, the next time it's enqueued,
* it will already see the updated clamp bucket value.
*/
- for_each_clamp_id(clamp_id) {
- if (p->uclamp[clamp_id].active) {
- uclamp_rq_dec_id(rq, p, clamp_id);
- uclamp_rq_inc_id(rq, p, clamp_id);
- }
- }
+ for_each_clamp_id(clamp_id)
+ uclamp_rq_reinc_id(rq, p, clamp_id);
task_rq_unlock(rq, p, &rf);
}
@@ -2161,7 +2190,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu)
/* Non kernel threads are not allowed during either online or offline. */
if (!(p->flags & PF_KTHREAD))
- return cpu_active(cpu);
+ return cpu_active(cpu) && task_cpu_possible(cpu, p);
/* KTHREAD_IS_PER_CPU is always allowed. */
if (kthread_is_per_cpu(p))
@@ -2468,6 +2497,34 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
__do_set_cpus_allowed(p, new_mask, 0);
}
+int dup_user_cpus_ptr(struct task_struct *dst, struct task_struct *src,
+ int node)
+{
+ if (!src->user_cpus_ptr)
+ return 0;
+
+ dst->user_cpus_ptr = kmalloc_node(cpumask_size(), GFP_KERNEL, node);
+ if (!dst->user_cpus_ptr)
+ return -ENOMEM;
+
+ cpumask_copy(dst->user_cpus_ptr, src->user_cpus_ptr);
+ return 0;
+}
+
+static inline struct cpumask *clear_user_cpus_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = NULL;
+
+ swap(p->user_cpus_ptr, user_mask);
+
+ return user_mask;
+}
+
+void release_user_cpus_ptr(struct task_struct *p)
+{
+ kfree(clear_user_cpus_ptr(p));
+}
+
/*
* This function is wildly self concurrent; here be dragons.
*
@@ -2685,28 +2742,26 @@ static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flag
}
/*
- * Change a given task's CPU affinity. Migrate the thread to a
- * proper CPU and schedule it away if the CPU it's executing on
- * is removed from the allowed bitmask.
- *
- * NOTE: the caller must have a valid reference to the task, the
- * task must not exit() & deallocate itself prematurely. The
- * call is not atomic; no spinlocks may be held.
+ * Called with both p->pi_lock and rq->lock held; drops both before returning.
*/
-static int __set_cpus_allowed_ptr(struct task_struct *p,
- const struct cpumask *new_mask,
- u32 flags)
+static int __set_cpus_allowed_ptr_locked(struct task_struct *p,
+ const struct cpumask *new_mask,
+ u32 flags,
+ struct rq *rq,
+ struct rq_flags *rf)
+ __releases(rq->lock)
+ __releases(p->pi_lock)
{
+ const struct cpumask *cpu_allowed_mask = task_cpu_possible_mask(p);
const struct cpumask *cpu_valid_mask = cpu_active_mask;
+ bool kthread = p->flags & PF_KTHREAD;
+ struct cpumask *user_mask = NULL;
unsigned int dest_cpu;
- struct rq_flags rf;
- struct rq *rq;
int ret = 0;
- rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
- if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
+ if (kthread || is_migration_disabled(p)) {
/*
* Kernel threads are allowed on online && !active CPUs,
* however, during cpu-hot-unplug, even these might get pushed
@@ -2720,6 +2775,11 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
cpu_valid_mask = cpu_online_mask;
}
+ if (!kthread && !cpumask_subset(new_mask, cpu_allowed_mask)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
/*
* Must re-check here, to close a race against __kthread_bind(),
* sched_setaffinity() is not guaranteed to observe the flag.
@@ -2754,20 +2814,178 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
__do_set_cpus_allowed(p, new_mask, flags);
- return affine_move_task(rq, p, &rf, dest_cpu, flags);
+ if (flags & SCA_USER)
+ user_mask = clear_user_cpus_ptr(p);
+
+ ret = affine_move_task(rq, p, rf, dest_cpu, flags);
+
+ kfree(user_mask);
+
+ return ret;
out:
- task_rq_unlock(rq, p, &rf);
+ task_rq_unlock(rq, p, rf);
return ret;
}
+/*
+ * Change a given task's CPU affinity. Migrate the thread to a
+ * proper CPU and schedule it away if the CPU it's executing on
+ * is removed from the allowed bitmask.
+ *
+ * NOTE: the caller must have a valid reference to the task, the
+ * task must not exit() & deallocate itself prematurely. The
+ * call is not atomic; no spinlocks may be held.
+ */
+static int __set_cpus_allowed_ptr(struct task_struct *p,
+ const struct cpumask *new_mask, u32 flags)
+{
+ struct rq_flags rf;
+ struct rq *rq;
+
+ rq = task_rq_lock(p, &rf);
+ return __set_cpus_allowed_ptr_locked(p, new_mask, flags, rq, &rf);
+}
+
int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
{
return __set_cpus_allowed_ptr(p, new_mask, 0);
}
EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
+/*
+ * Change a given task's CPU affinity to the intersection of its current
+ * affinity mask and @subset_mask, writing the resulting mask to @new_mask
+ * and pointing @p->user_cpus_ptr to a copy of the old mask.
+ * If the resulting mask is empty, leave the affinity unchanged and return
+ * -EINVAL.
+ */
+static int restrict_cpus_allowed_ptr(struct task_struct *p,
+ struct cpumask *new_mask,
+ const struct cpumask *subset_mask)
+{
+ struct cpumask *user_mask = NULL;
+ struct rq_flags rf;
+ struct rq *rq;
+ int err;
+
+ if (!p->user_cpus_ptr) {
+ user_mask = kmalloc(cpumask_size(), GFP_KERNEL);
+ if (!user_mask)
+ return -ENOMEM;
+ }
+
+ rq = task_rq_lock(p, &rf);
+
+ /*
+ * Forcefully restricting the affinity of a deadline task is
+ * likely to cause problems, so fail and noisily override the
+ * mask entirely.
+ */
+ if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
+ err = -EPERM;
+ goto err_unlock;
+ }
+
+ if (!cpumask_and(new_mask, &p->cpus_mask, subset_mask)) {
+ err = -EINVAL;
+ goto err_unlock;
+ }
+
+ /*
+ * We're about to butcher the task affinity, so keep track of what
+ * the user asked for in case we're able to restore it later on.
+ */
+ if (user_mask) {
+ cpumask_copy(user_mask, p->cpus_ptr);
+ p->user_cpus_ptr = user_mask;
+ }
+
+ return __set_cpus_allowed_ptr_locked(p, new_mask, 0, rq, &rf);
+
+err_unlock:
+ task_rq_unlock(rq, p, &rf);
+ kfree(user_mask);
+ return err;
+}
+
+/*
+ * Restrict the CPU affinity of task @p so that it is a subset of
+ * task_cpu_possible_mask() and point @p->user_cpu_ptr to a copy of the
+ * old affinity mask. If the resulting mask is empty, we warn and walk
+ * up the cpuset hierarchy until we find a suitable mask.
+ */
+void force_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ cpumask_var_t new_mask;
+ const struct cpumask *override_mask = task_cpu_possible_mask(p);
+
+ alloc_cpumask_var(&new_mask, GFP_KERNEL);
+
+ /*
+ * __migrate_task() can fail silently in the face of concurrent
+ * offlining of the chosen destination CPU, so take the hotplug
+ * lock to ensure that the migration succeeds.
+ */
+ cpus_read_lock();
+ if (!cpumask_available(new_mask))
+ goto out_set_mask;
+
+ if (!restrict_cpus_allowed_ptr(p, new_mask, override_mask))
+ goto out_free_mask;
+
+ /*
+ * We failed to find a valid subset of the affinity mask for the
+ * task, so override it based on its cpuset hierarchy.
+ */
+ cpuset_cpus_allowed(p, new_mask);
+ override_mask = new_mask;
+
+out_set_mask:
+ if (printk_ratelimit()) {
+ printk_deferred("Overriding affinity for process %d (%s) to CPUs %*pbl\n",
+ task_pid_nr(p), p->comm,
+ cpumask_pr_args(override_mask));
+ }
+
+ WARN_ON(set_cpus_allowed_ptr(p, override_mask));
+out_free_mask:
+ cpus_read_unlock();
+ free_cpumask_var(new_mask);
+}
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask);
+
+/*
+ * Restore the affinity of a task @p which was previously restricted by a
+ * call to force_compatible_cpus_allowed_ptr(). This will clear (and free)
+ * @p->user_cpus_ptr.
+ *
+ * It is the caller's responsibility to serialise this with any calls to
+ * force_compatible_cpus_allowed_ptr(@p).
+ */
+void relax_compatible_cpus_allowed_ptr(struct task_struct *p)
+{
+ struct cpumask *user_mask = p->user_cpus_ptr;
+ unsigned long flags;
+
+ /*
+ * Try to restore the old affinity mask. If this fails, then
+ * we free the mask explicitly to avoid it being inherited across
+ * a subsequent fork().
+ */
+ if (!user_mask || !__sched_setaffinity(p, user_mask))
+ return;
+
+ raw_spin_lock_irqsave(&p->pi_lock, flags);
+ user_mask = clear_user_cpus_ptr(p);
+ raw_spin_unlock_irqrestore(&p->pi_lock, flags);
+
+ kfree(user_mask);
+}
+
void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
{
#ifdef CONFIG_SCHED_DEBUG
@@ -3112,9 +3330,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* Look for allowed, online CPU in same node. */
for_each_cpu(dest_cpu, nodemask) {
- if (!cpu_active(dest_cpu))
- continue;
- if (cpumask_test_cpu(dest_cpu, p->cpus_ptr))
+ if (is_cpu_allowed(p, dest_cpu))
return dest_cpu;
}
}
@@ -3131,8 +3347,7 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
/* No more Mr. Nice Guy. */
switch (state) {
case cpuset:
- if (IS_ENABLED(CONFIG_CPUSETS)) {
- cpuset_cpus_allowed_fallback(p);
+ if (cpuset_cpus_allowed_fallback(p)) {
state = possible;
break;
}
@@ -3144,10 +3359,9 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
*
* More yuck to audit.
*/
- do_set_cpus_allowed(p, cpu_possible_mask);
+ do_set_cpus_allowed(p, task_cpu_possible_mask(p));
state = fail;
break;
-
case fail:
BUG();
break;
@@ -5705,11 +5919,9 @@ static bool try_steal_cookie(int this, int that)
if (p->core_occupation > dst->idle->core_occupation)
goto next;
- p->on_rq = TASK_ON_RQ_MIGRATING;
deactivate_task(src, p, 0);
set_task_cpu(p, this);
activate_task(dst, p, 0);
- p->on_rq = TASK_ON_RQ_QUEUED;
resched_curr(dst);
@@ -5781,35 +5993,109 @@ void queue_core_balance(struct rq *rq)
queue_balance_callback(rq, &per_cpu(core_balance_head, rq->cpu), sched_core_balance);
}
-static inline void sched_core_cpu_starting(unsigned int cpu)
+static void sched_core_cpu_starting(unsigned int cpu)
{
const struct cpumask *smt_mask = cpu_smt_mask(cpu);
- struct rq *rq, *core_rq = NULL;
- int i;
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
- core_rq = cpu_rq(cpu)->core;
+ WARN_ON_ONCE(rq->core != rq);
+
+ /* if we're the first, we'll be our own leader */
+ if (cpumask_weight(smt_mask) == 1)
+ goto unlock;
- if (!core_rq) {
- for_each_cpu(i, smt_mask) {
- rq = cpu_rq(i);
- if (rq->core && rq->core == rq)
- core_rq = rq;
+ /* find the leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ rq = cpu_rq(t);
+ if (rq->core == rq) {
+ core_rq = rq;
+ break;
}
+ }
- if (!core_rq)
- core_rq = cpu_rq(cpu);
+ if (WARN_ON_ONCE(!core_rq)) /* whoopsie */
+ goto unlock;
- for_each_cpu(i, smt_mask) {
- rq = cpu_rq(i);
+ /* install and validate core_rq */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
- WARN_ON_ONCE(rq->core && rq->core != core_rq);
+ if (t == cpu)
rq->core = core_rq;
- }
+
+ WARN_ON_ONCE(rq->core != core_rq);
}
+
+unlock:
+ sched_core_unlock(cpu, &flags);
}
+
+static void sched_core_cpu_deactivate(unsigned int cpu)
+{
+ const struct cpumask *smt_mask = cpu_smt_mask(cpu);
+ struct rq *rq = cpu_rq(cpu), *core_rq = NULL;
+ unsigned long flags;
+ int t;
+
+ sched_core_lock(cpu, &flags);
+
+ /* if we're the last man standing, nothing to do */
+ if (cpumask_weight(smt_mask) == 1) {
+ WARN_ON_ONCE(rq->core != rq);
+ goto unlock;
+ }
+
+ /* if we're not the leader, nothing to do */
+ if (rq->core != rq)
+ goto unlock;
+
+ /* find a new leader */
+ for_each_cpu(t, smt_mask) {
+ if (t == cpu)
+ continue;
+ core_rq = cpu_rq(t);
+ break;
+ }
+
+ if (WARN_ON_ONCE(!core_rq)) /* impossible */
+ goto unlock;
+
+ /* copy the shared state to the new leader */
+ core_rq->core_task_seq = rq->core_task_seq;
+ core_rq->core_pick_seq = rq->core_pick_seq;
+ core_rq->core_cookie = rq->core_cookie;
+ core_rq->core_forceidle = rq->core_forceidle;
+ core_rq->core_forceidle_seq = rq->core_forceidle_seq;
+
+ /* install new leader */
+ for_each_cpu(t, smt_mask) {
+ rq = cpu_rq(t);
+ rq->core = core_rq;
+ }
+
+unlock:
+ sched_core_unlock(cpu, &flags);
+}
+
+static inline void sched_core_cpu_dying(unsigned int cpu)
+{
+ struct rq *rq = cpu_rq(cpu);
+
+ if (rq->core != rq)
+ rq->core = rq;
+}
+
#else /* !CONFIG_SCHED_CORE */
static inline void sched_core_cpu_starting(unsigned int cpu) {}
+static inline void sched_core_cpu_deactivate(unsigned int cpu) {}
+static inline void sched_core_cpu_dying(unsigned int cpu) {}
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
@@ -7375,6 +7661,16 @@ err_size:
return -E2BIG;
}
+static void get_params(struct task_struct *p, struct sched_attr *attr)
+{
+ if (task_has_dl_policy(p))
+ __getparam_dl(p, attr);
+ else if (task_has_rt_policy(p))
+ attr->sched_priority = p->rt_priority;
+ else
+ attr->sched_nice = task_nice(p);
+}
+
/**
* sys_sched_setscheduler - set/change the scheduler policy and RT priority
* @pid: the pid in question.
@@ -7436,6 +7732,8 @@ SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
rcu_read_unlock();
if (likely(p)) {
+ if (attr.sched_flags & SCHED_FLAG_KEEP_PARAMS)
+ get_params(p, &attr);
retval = sched_setattr(p, &attr);
put_task_struct(p);
}
@@ -7584,12 +7882,8 @@ SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
kattr.sched_policy = p->policy;
if (p->sched_reset_on_fork)
kattr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
- if (task_has_dl_policy(p))
- __getparam_dl(p, &kattr);
- else if (task_has_rt_policy(p))
- kattr.sched_priority = p->rt_priority;
- else
- kattr.sched_nice = task_nice(p);
+ get_params(p, &kattr);
+ kattr.sched_flags &= SCHED_FLAG_ALL;
#ifdef CONFIG_UCLAMP_TASK
/*
@@ -7610,9 +7904,76 @@ out_unlock:
return retval;
}
-long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+#ifdef CONFIG_SMP
+int dl_task_check_affinity(struct task_struct *p, const struct cpumask *mask)
{
+ int ret = 0;
+
+ /*
+ * If the task isn't a deadline task or admission control is
+ * disabled then we don't care about affinity changes.
+ */
+ if (!task_has_dl_policy(p) || !dl_bandwidth_enabled())
+ return 0;
+
+ /*
+ * Since bandwidth control happens on root_domain basis,
+ * if admission test is enabled, we only admit -deadline
+ * tasks allowed to run on all the CPUs in the task's
+ * root_domain.
+ */
+ rcu_read_lock();
+ if (!cpumask_subset(task_rq(p)->rd->span, mask))
+ ret = -EBUSY;
+ rcu_read_unlock();
+ return ret;
+}
+#endif
+
+static int
+__sched_setaffinity(struct task_struct *p, const struct cpumask *mask)
+{
+ int retval;
cpumask_var_t cpus_allowed, new_mask;
+
+ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL))
+ return -ENOMEM;
+
+ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
+ retval = -ENOMEM;
+ goto out_free_cpus_allowed;
+ }
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ cpumask_and(new_mask, mask, cpus_allowed);
+
+ retval = dl_task_check_affinity(p, new_mask);
+ if (retval)
+ goto out_free_new_mask;
+again:
+ retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK | SCA_USER);
+ if (retval)
+ goto out_free_new_mask;
+
+ cpuset_cpus_allowed(p, cpus_allowed);
+ if (!cpumask_subset(new_mask, cpus_allowed)) {
+ /*
+ * We must have raced with a concurrent cpuset update.
+ * Just reset the cpumask to the cpuset's cpus_allowed.
+ */
+ cpumask_copy(new_mask, cpus_allowed);
+ goto again;
+ }
+
+out_free_new_mask:
+ free_cpumask_var(new_mask);
+out_free_cpus_allowed:
+ free_cpumask_var(cpus_allowed);
+ return retval;
+}
+
+long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
+{
struct task_struct *p;
int retval;
@@ -7632,68 +7993,22 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
retval = -EINVAL;
goto out_put_task;
}
- if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_put_task;
- }
- if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
- retval = -ENOMEM;
- goto out_free_cpus_allowed;
- }
- retval = -EPERM;
+
if (!check_same_owner(p)) {
rcu_read_lock();
if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
rcu_read_unlock();
- goto out_free_new_mask;
+ retval = -EPERM;
+ goto out_put_task;
}
rcu_read_unlock();
}
retval = security_task_setscheduler(p);
if (retval)
- goto out_free_new_mask;
-
-
- cpuset_cpus_allowed(p, cpus_allowed);
- cpumask_and(new_mask, in_mask, cpus_allowed);
-
- /*
- * Since bandwidth control happens on root_domain basis,
- * if admission test is enabled, we only admit -deadline
- * tasks allowed to run on all the CPUs in the task's
- * root_domain.
- */
-#ifdef CONFIG_SMP
- if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
- rcu_read_lock();
- if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
- retval = -EBUSY;
- rcu_read_unlock();
- goto out_free_new_mask;
- }
- rcu_read_unlock();
- }
-#endif
-again:
- retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK);
+ goto out_put_task;
- if (!retval) {
- cpuset_cpus_allowed(p, cpus_allowed);
- if (!cpumask_subset(new_mask, cpus_allowed)) {
- /*
- * We must have raced with a concurrent cpuset
- * update. Just reset the cpus_allowed to the
- * cpuset's cpus_allowed
- */
- cpumask_copy(new_mask, cpus_allowed);
- goto again;
- }
- }
-out_free_new_mask:
- free_cpumask_var(new_mask);
-out_free_cpus_allowed:
- free_cpumask_var(cpus_allowed);
+ retval = __sched_setaffinity(p, in_mask);
out_put_task:
put_task_struct(p);
return retval;
@@ -7836,6 +8151,17 @@ int __sched __cond_resched(void)
preempt_schedule_common();
return 1;
}
+ /*
+ * In preemptible kernels, ->rcu_read_lock_nesting tells the tick
+ * whether the current CPU is in an RCU read-side critical section,
+ * so the tick can report quiescent states even for CPUs looping
+ * in kernel context. In contrast, in non-preemptible kernels,
+ * RCU readers leave no in-memory hints, which means that CPU-bound
+ * processes executing in kernel context might never report an
+ * RCU quiescent state. Therefore, the following code causes
+ * cond_resched() to report a quiescent state, but only when RCU
+ * is in urgent need of one.
+ */
#ifndef CONFIG_PREEMPT_RCU
rcu_all_qs();
#endif
@@ -8782,6 +9108,8 @@ int sched_cpu_deactivate(unsigned int cpu)
*/
if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
static_branch_dec_cpuslocked(&sched_smt_present);
+
+ sched_core_cpu_deactivate(cpu);
#endif
if (!sched_smp_initialized)
@@ -8886,6 +9214,7 @@ int sched_cpu_dying(unsigned int cpu)
calc_load_migrate(rq);
update_max_interval();
hrtick_clear(rq);
+ sched_core_cpu_dying(cpu);
return 0;
}
#endif
@@ -9097,7 +9426,7 @@ void __init sched_init(void)
atomic_set(&rq->nr_iowait, 0);
#ifdef CONFIG_SCHED_CORE
- rq->core = NULL;
+ rq->core = rq;
rq->core_pick = NULL;
rq->core_enabled = 0;
rq->core_tree = RB_ROOT;
@@ -9879,7 +10208,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
* Prevent race between setting of cfs_rq->runtime_enabled and
* unthrottle_offline_cfs_rqs().
*/
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&cfs_constraints_mutex);
ret = __cfs_schedulable(tg, period, quota);
if (ret)
@@ -9923,7 +10252,7 @@ static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota,
cfs_bandwidth_usage_dec();
out_unlock:
mutex_unlock(&cfs_constraints_mutex);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
@@ -10174,6 +10503,20 @@ static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
}
#endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static s64 cpu_idle_read_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft)
+{
+ return css_tg(css)->idle;
+}
+
+static int cpu_idle_write_s64(struct cgroup_subsys_state *css,
+ struct cftype *cft, s64 idle)
+{
+ return sched_group_set_idle(css_tg(css), idle);
+}
+#endif
+
static struct cftype cpu_legacy_files[] = {
#ifdef CONFIG_FAIR_GROUP_SCHED
{
@@ -10181,6 +10524,11 @@ static struct cftype cpu_legacy_files[] = {
.read_u64 = cpu_shares_read_u64,
.write_u64 = cpu_shares_write_u64,
},
+ {
+ .name = "idle",
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
@@ -10388,6 +10736,12 @@ static struct cftype cpu_files[] = {
.read_s64 = cpu_weight_nice_read_s64,
.write_s64 = cpu_weight_nice_write_s64,
},
+ {
+ .name = "idle",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .read_s64 = cpu_idle_read_s64,
+ .write_s64 = cpu_idle_write_s64,
+ },
#endif
#ifdef CONFIG_CFS_BANDWIDTH
{
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index aaacd6cfd42f..e94314633b39 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1733,6 +1733,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
*/
raw_spin_rq_lock(rq);
if (p->dl.dl_non_contending) {
+ update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
p->dl.dl_non_contending = 0;
/*
@@ -2741,7 +2742,7 @@ void __setparam_dl(struct task_struct *p, const struct sched_attr *attr)
dl_se->dl_runtime = attr->sched_runtime;
dl_se->dl_deadline = attr->sched_deadline;
dl_se->dl_period = attr->sched_period ?: dl_se->dl_deadline;
- dl_se->flags = attr->sched_flags;
+ dl_se->flags = attr->sched_flags & SCHED_DL_FLAGS;
dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime);
dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime);
}
@@ -2754,7 +2755,8 @@ void __getparam_dl(struct task_struct *p, struct sched_attr *attr)
attr->sched_runtime = dl_se->dl_runtime;
attr->sched_deadline = dl_se->dl_deadline;
attr->sched_period = dl_se->dl_period;
- attr->sched_flags = dl_se->flags;
+ attr->sched_flags &= ~SCHED_DL_FLAGS;
+ attr->sched_flags |= dl_se->flags;
}
/*
@@ -2851,7 +2853,7 @@ bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr)
if (dl_se->dl_runtime != attr->sched_runtime ||
dl_se->dl_deadline != attr->sched_deadline ||
dl_se->dl_period != attr->sched_period ||
- dl_se->flags != attr->sched_flags)
+ dl_se->flags != (attr->sched_flags & SCHED_DL_FLAGS))
return true;
return false;
diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c
index 0c5ec2776ddf..49716228efb4 100644
--- a/kernel/sched/debug.c
+++ b/kernel/sched/debug.c
@@ -388,6 +388,13 @@ void update_sched_domain_debugfs(void)
{
int cpu, i;
+ /*
+ * This can unfortunately be invoked before sched_debug_init() creates
+ * the debug directory. Don't touch sd_sysctl_cpus until then.
+ */
+ if (!debugfs_sched)
+ return;
+
if (!cpumask_available(sd_sysctl_cpus)) {
if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL))
return;
@@ -600,6 +607,9 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over",
cfs_rq->nr_spread_over);
SEQ_printf(m, " .%-30s: %d\n", "nr_running", cfs_rq->nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running);
+ SEQ_printf(m, " .%-30s: %d\n", "idle_h_nr_running",
+ cfs_rq->idle_h_nr_running);
SEQ_printf(m, " .%-30s: %ld\n", "load", cfs_rq->load.weight);
#ifdef CONFIG_SMP
SEQ_printf(m, " .%-30s: %lu\n", "load_avg",
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 44c452072a1b..ff69f245b939 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -431,6 +431,23 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
}
}
+static int tg_is_idle(struct task_group *tg)
+{
+ return tg->idle > 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return cfs_rq->idle > 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ if (entity_is_task(se))
+ return task_has_idle_policy(task_of(se));
+ return cfs_rq_is_idle(group_cfs_rq(se));
+}
+
#else /* !CONFIG_FAIR_GROUP_SCHED */
#define for_each_sched_entity(se) \
@@ -468,6 +485,21 @@ find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}
+static inline int tg_is_idle(struct task_group *tg)
+{
+ return 0;
+}
+
+static int cfs_rq_is_idle(struct cfs_rq *cfs_rq)
+{
+ return 0;
+}
+
+static int se_is_idle(struct sched_entity *se)
+{
+ return 0;
+}
+
#endif /* CONFIG_FAIR_GROUP_SCHED */
static __always_inline
@@ -1486,7 +1518,7 @@ static inline bool is_core_idle(int cpu)
if (cpu == sibling)
continue;
- if (!idle_cpu(cpu))
+ if (!idle_cpu(sibling))
return false;
}
#endif
@@ -4841,6 +4873,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
@@ -4860,6 +4895,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
update_load_avg(qcfs_rq, se, 0);
se_update_runnable(se);
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
}
@@ -4904,39 +4942,45 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
task_delta = cfs_rq->h_nr_running;
idle_task_delta = cfs_rq->idle_h_nr_running;
for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+
if (se->on_rq)
break;
- cfs_rq = cfs_rq_of(se);
- enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
+ enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP);
+
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
}
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- update_load_avg(cfs_rq, se, UPDATE_TG);
+ update_load_avg(qcfs_rq, se, UPDATE_TG);
se_update_runnable(se);
- cfs_rq->h_nr_running += task_delta;
- cfs_rq->idle_h_nr_running += idle_task_delta;
+ if (cfs_rq_is_idle(group_cfs_rq(se)))
+ idle_task_delta = cfs_rq->h_nr_running;
+ qcfs_rq->h_nr_running += task_delta;
+ qcfs_rq->idle_h_nr_running += idle_task_delta;
/* end evaluation on encountering a throttled cfs_rq */
- if (cfs_rq_throttled(cfs_rq))
+ if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
/*
* One parent has been throttled and cfs_rq removed from the
* list. Add it back to not break the leaf list.
*/
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
+ if (throttled_hierarchy(qcfs_rq))
+ list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -4949,9 +4993,9 @@ unthrottle_throttle:
* assertion below.
*/
for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (list_add_leaf_cfs_rq(qcfs_rq))
break;
}
@@ -5574,6 +5618,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5591,6 +5638,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running++;
cfs_rq->idle_h_nr_running += idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
@@ -5668,6 +5718,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
@@ -5697,6 +5750,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
cfs_rq->h_nr_running--;
cfs_rq->idle_h_nr_running -= idle_h_nr_running;
+ if (cfs_rq_is_idle(cfs_rq))
+ idle_h_nr_running = 1;
+
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto dequeue_throttle;
@@ -6249,7 +6305,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool
time = cpu_clock(this);
}
- for_each_cpu_wrap(cpu, cpus, target) {
+ for_each_cpu_wrap(cpu, cpus, target + 1) {
if (has_idle_core) {
i = select_idle_core(p, cpu, cpus, &idle_cpu);
if ((unsigned int)i < nr_cpumask_bits)
@@ -6376,6 +6432,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
/* Check a recently used CPU as a potential idle candidate: */
recent_used_cpu = p->recent_used_cpu;
+ p->recent_used_cpu = prev;
if (recent_used_cpu != prev &&
recent_used_cpu != target &&
cpus_share_cache(recent_used_cpu, target) &&
@@ -6902,9 +6959,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags)
} else if (wake_flags & WF_TTWU) { /* XXX always ? */
/* Fast path */
new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
-
- if (want_affine)
- current->recent_used_cpu = cpu;
}
rcu_read_unlock();
@@ -7041,24 +7095,22 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
static void set_last_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->last = se;
}
}
static void set_next_buddy(struct sched_entity *se)
{
- if (entity_is_task(se) && unlikely(task_has_idle_policy(task_of(se))))
- return;
-
for_each_sched_entity(se) {
if (SCHED_WARN_ON(!se->on_rq))
return;
+ if (se_is_idle(se))
+ return;
cfs_rq_of(se)->next = se;
}
}
@@ -7079,6 +7131,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
struct cfs_rq *cfs_rq = task_cfs_rq(curr);
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
+ int cse_is_idle, pse_is_idle;
if (unlikely(se == pse))
return;
@@ -7123,8 +7176,21 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
return;
find_matching_se(&se, &pse);
- update_curr(cfs_rq_of(se));
BUG_ON(!pse);
+
+ cse_is_idle = se_is_idle(se);
+ pse_is_idle = se_is_idle(pse);
+
+ /*
+ * Preempt an idle group in favor of a non-idle group (and don't preempt
+ * in the inverse case).
+ */
+ if (cse_is_idle && !pse_is_idle)
+ goto preempt;
+ if (cse_is_idle != pse_is_idle)
+ return;
+
+ update_curr(cfs_rq_of(se));
if (wakeup_preempt_entity(se, pse) == 1) {
/*
* Bias pick_next to pick the sched entity that is
@@ -10217,9 +10283,11 @@ static inline int on_null_domain(struct rq *rq)
static inline int find_new_ilb(void)
{
int ilb;
+ const struct cpumask *hk_mask;
+
+ hk_mask = housekeeping_cpumask(HK_FLAG_MISC);
- for_each_cpu_and(ilb, nohz.idle_cpus_mask,
- housekeeping_cpumask(HK_FLAG_MISC)) {
+ for_each_cpu_and(ilb, nohz.idle_cpus_mask, hk_mask) {
if (ilb == smp_processor_id())
continue;
@@ -11416,10 +11484,12 @@ void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
static DEFINE_MUTEX(shares_mutex);
-int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+static int __sched_group_set_shares(struct task_group *tg, unsigned long shares)
{
int i;
+ lockdep_assert_held(&shares_mutex);
+
/*
* We can't change the weight of the root cgroup.
*/
@@ -11428,9 +11498,8 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
- mutex_lock(&shares_mutex);
if (tg->shares == shares)
- goto done;
+ return 0;
tg->shares = shares;
for_each_possible_cpu(i) {
@@ -11448,10 +11517,88 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
rq_unlock_irqrestore(rq, &rf);
}
-done:
+ return 0;
+}
+
+int sched_group_set_shares(struct task_group *tg, unsigned long shares)
+{
+ int ret;
+
+ mutex_lock(&shares_mutex);
+ if (tg_is_idle(tg))
+ ret = -EINVAL;
+ else
+ ret = __sched_group_set_shares(tg, shares);
+ mutex_unlock(&shares_mutex);
+
+ return ret;
+}
+
+int sched_group_set_idle(struct task_group *tg, long idle)
+{
+ int i;
+
+ if (tg == &root_task_group)
+ return -EINVAL;
+
+ if (idle < 0 || idle > 1)
+ return -EINVAL;
+
+ mutex_lock(&shares_mutex);
+
+ if (tg->idle == idle) {
+ mutex_unlock(&shares_mutex);
+ return 0;
+ }
+
+ tg->idle = idle;
+
+ for_each_possible_cpu(i) {
+ struct rq *rq = cpu_rq(i);
+ struct sched_entity *se = tg->se[i];
+ struct cfs_rq *grp_cfs_rq = tg->cfs_rq[i];
+ bool was_idle = cfs_rq_is_idle(grp_cfs_rq);
+ long idle_task_delta;
+ struct rq_flags rf;
+
+ rq_lock_irqsave(rq, &rf);
+
+ grp_cfs_rq->idle = idle;
+ if (WARN_ON_ONCE(was_idle == cfs_rq_is_idle(grp_cfs_rq)))
+ goto next_cpu;
+
+ idle_task_delta = grp_cfs_rq->h_nr_running -
+ grp_cfs_rq->idle_h_nr_running;
+ if (!cfs_rq_is_idle(grp_cfs_rq))
+ idle_task_delta *= -1;
+
+ for_each_sched_entity(se) {
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (!se->on_rq)
+ break;
+
+ cfs_rq->idle_h_nr_running += idle_task_delta;
+
+ /* Already accounted at parent level and above. */
+ if (cfs_rq_is_idle(cfs_rq))
+ break;
+ }
+
+next_cpu:
+ rq_unlock_irqrestore(rq, &rf);
+ }
+
+ /* Idle groups have minimum weight. */
+ if (tg_is_idle(tg))
+ __sched_group_set_shares(tg, scale_load(WEIGHT_IDLEPRIO));
+ else
+ __sched_group_set_shares(tg, NICE_0_LOAD);
+
mutex_unlock(&shares_mutex);
return 0;
}
+
#else /* CONFIG_FAIR_GROUP_SCHED */
void free_fair_sched_group(struct task_group *tg) { }
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 14a41a243f7b..3d3e5793e117 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -227,6 +227,8 @@ static inline void update_avg(u64 *avg, u64 sample)
*/
#define SCHED_FLAG_SUGOV 0x10000000
+#define SCHED_DL_FLAGS (SCHED_FLAG_RECLAIM | SCHED_FLAG_DL_OVERRUN | SCHED_FLAG_SUGOV)
+
static inline bool dl_entity_is_special(struct sched_dl_entity *dl_se)
{
#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL
@@ -394,6 +396,9 @@ struct task_group {
struct cfs_rq **cfs_rq;
unsigned long shares;
+ /* A positive value indicates that this is a SCHED_IDLE group. */
+ int idle;
+
#ifdef CONFIG_SMP
/*
* load_avg can be heavily contended at clock tick time, so put
@@ -503,6 +508,8 @@ extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern int sched_group_set_idle(struct task_group *tg, long idle);
+
#ifdef CONFIG_SMP
extern void set_task_rq_fair(struct sched_entity *se,
struct cfs_rq *prev, struct cfs_rq *next);
@@ -599,6 +606,9 @@ struct cfs_rq {
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+ /* Locally cached copy of our task_group's idle value */
+ int idle;
+
#ifdef CONFIG_CFS_BANDWIDTH
int runtime_enabled;
s64 runtime_remaining;
@@ -1093,7 +1103,7 @@ struct rq {
unsigned int core_sched_seq;
struct rb_root core_tree;
- /* shared state */
+ /* shared state -- careful with sched_core_cpu_deactivate() */
unsigned int core_task_seq;
unsigned int core_pick_seq;
unsigned long core_cookie;
@@ -2234,6 +2244,7 @@ extern struct task_struct *pick_next_task_idle(struct rq *rq);
#define SCA_CHECK 0x01
#define SCA_MIGRATE_DISABLE 0x02
#define SCA_MIGRATE_ENABLE 0x04
+#define SCA_USER 0x08
#ifdef CONFIG_SMP
@@ -2255,6 +2266,9 @@ static inline struct task_struct *get_push_task(struct rq *rq)
if (p->nr_cpus_allowed == 1)
return NULL;
+ if (p->migration_disabled)
+ return NULL;
+
rq->push_busy = true;
return get_task_struct(p);
}
@@ -2385,6 +2399,21 @@ extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags);
extern const_debug unsigned int sysctl_sched_nr_migrate;
extern const_debug unsigned int sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_DEBUG
+extern unsigned int sysctl_sched_latency;
+extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_wakeup_granularity;
+extern int sysctl_resched_latency_warn_ms;
+extern int sysctl_resched_latency_warn_once;
+
+extern unsigned int sysctl_sched_tunable_scaling;
+
+extern unsigned int sysctl_numa_balancing_scan_delay;
+extern unsigned int sysctl_numa_balancing_scan_period_min;
+extern unsigned int sysctl_numa_balancing_scan_period_max;
+extern unsigned int sysctl_numa_balancing_scan_size;
+#endif
+
#ifdef CONFIG_SCHED_HRTICK
/*
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index b77ad49dc14f..4e8698e62f07 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -1482,6 +1482,8 @@ int sched_max_numa_distance;
static int *sched_domains_numa_distance;
static struct cpumask ***sched_domains_numa_masks;
int __read_mostly node_reclaim_distance = RECLAIM_DISTANCE;
+
+static unsigned long __read_mostly *sched_numa_onlined_nodes;
#endif
/*
@@ -1833,6 +1835,16 @@ void sched_init_numa(void)
sched_domains_numa_masks[i][j] = mask;
for_each_node(k) {
+ /*
+ * Distance information can be unreliable for
+ * offline nodes, defer building the node
+ * masks to its bringup.
+ * This relies on all unique distance values
+ * still being visible at init time.
+ */
+ if (!node_online(j))
+ continue;
+
if (sched_debug() && (node_distance(j, k) != node_distance(k, j)))
sched_numa_warn("Node-distance not symmetric");
@@ -1886,6 +1898,53 @@ void sched_init_numa(void)
sched_max_numa_distance = sched_domains_numa_distance[nr_levels - 1];
init_numa_topology_type();
+
+ sched_numa_onlined_nodes = bitmap_alloc(nr_node_ids, GFP_KERNEL);
+ if (!sched_numa_onlined_nodes)
+ return;
+
+ bitmap_zero(sched_numa_onlined_nodes, nr_node_ids);
+ for_each_online_node(i)
+ bitmap_set(sched_numa_onlined_nodes, i, 1);
+}
+
+static void __sched_domains_numa_masks_set(unsigned int node)
+{
+ int i, j;
+
+ /*
+ * NUMA masks are not built for offline nodes in sched_init_numa().
+ * Thus, when a CPU of a never-onlined-before node gets plugged in,
+ * adding that new CPU to the right NUMA masks is not sufficient: the
+ * masks of that CPU's node must also be updated.
+ */
+ if (test_bit(node, sched_numa_onlined_nodes))
+ return;
+
+ bitmap_set(sched_numa_onlined_nodes, node, 1);
+
+ for (i = 0; i < sched_domains_numa_levels; i++) {
+ for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j) || node == j)
+ continue;
+
+ if (node_distance(j, node) > sched_domains_numa_distance[i])
+ continue;
+
+ /* Add remote nodes in our masks */
+ cpumask_or(sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[i][node],
+ sched_domains_numa_masks[0][j]);
+ }
+ }
+
+ /*
+ * A new node has been brought up, potentially changing the topology
+ * classification.
+ *
+ * Note that this is racy vs any use of sched_numa_topology_type :/
+ */
+ init_numa_topology_type();
}
void sched_domains_numa_masks_set(unsigned int cpu)
@@ -1893,8 +1952,14 @@ void sched_domains_numa_masks_set(unsigned int cpu)
int node = cpu_to_node(cpu);
int i, j;
+ __sched_domains_numa_masks_set(node);
+
for (i = 0; i < sched_domains_numa_levels; i++) {
for (j = 0; j < nr_node_ids; j++) {
+ if (!node_online(j))
+ continue;
+
+ /* Set ourselves in the remote node's masks */
if (node_distance(j, node) <= sched_domains_numa_distance[i])
cpumask_set_cpu(cpu, sched_domains_numa_masks[i][j]);
}
diff --git a/kernel/smp.c b/kernel/smp.c
index 52bf159ec400..f43ede0ab183 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -764,7 +764,7 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info,
EXPORT_SYMBOL(smp_call_function_single);
/**
- * smp_call_function_single_async(): Run an asynchronous function on a
+ * smp_call_function_single_async() - Run an asynchronous function on a
* specific CPU.
* @cpu: The CPU to run on.
* @csd: Pre-allocated and setup data structure
@@ -783,6 +783,8 @@ EXPORT_SYMBOL(smp_call_function_single);
*
* NOTE: Be careful, there is unfortunately no current debugging facility to
* validate the correctness of this serialization.
+ *
+ * Return: %0 on success or negative errno value on error
*/
int smp_call_function_single_async(int cpu, struct __call_single_data *csd)
{
@@ -974,7 +976,7 @@ static void smp_call_function_many_cond(const struct cpumask *mask,
* @mask: The set of cpus to run on (only runs on online subset).
* @func: The function to run. This must be fast and non-blocking.
* @info: An arbitrary pointer to pass to the function.
- * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait
+ * @wait: Bitmask that controls the operation. If %SCF_WAIT is set, wait
* (atomically) until function has completed on other CPUs. If
* %SCF_RUN_LOCAL is set, the function will also be run locally
* if the local CPU is set in the @cpumask.
@@ -1180,7 +1182,13 @@ void wake_up_all_idle_cpus(void)
EXPORT_SYMBOL_GPL(wake_up_all_idle_cpus);
/**
- * smp_call_on_cpu - Call a function on a specific cpu
+ * struct smp_call_on_cpu_struct - Call a function on a specific CPU
+ * @work: &work_struct
+ * @done: &completion to signal
+ * @func: function to call
+ * @data: function's data argument
+ * @ret: return value from @func
+ * @cpu: target CPU (%-1 for any CPU)
*
* Used to call a function on a specific cpu and wait for it to return.
* Optionally make sure the call is done on a specified physical cpu via vcpu
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index cf6acab78538..f6bc0bc8a2aa 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -291,7 +291,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
unsigned int cpu;
int ret = 0;
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
@@ -304,7 +304,7 @@ int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
+ cpus_read_unlock();
return ret;
}
EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
@@ -317,12 +317,12 @@ EXPORT_SYMBOL_GPL(smpboot_register_percpu_thread);
*/
void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
- get_online_cpus();
+ cpus_read_lock();
mutex_lock(&smpboot_threads_lock);
list_del(&plug_thread->list);
smpboot_destroy_threads(plug_thread);
mutex_unlock(&smpboot_threads_lock);
- put_online_cpus();
+ cpus_read_unlock();
}
EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
diff --git a/kernel/torture.c b/kernel/torture.c
index 0a315c387bed..bb8f411c974b 100644
--- a/kernel/torture.c
+++ b/kernel/torture.c
@@ -521,11 +521,11 @@ static void torture_shuffle_tasks(void)
struct shuffle_task *stp;
cpumask_setall(shuffle_tmp_mask);
- get_online_cpus();
+ cpus_read_lock();
/* No point in shuffling if there is only one online CPU (ex: UP) */
if (num_online_cpus() == 1) {
- put_online_cpus();
+ cpus_read_unlock();
return;
}
@@ -541,7 +541,7 @@ static void torture_shuffle_tasks(void)
set_cpus_allowed_ptr(stp->st_t, shuffle_tmp_mask);
mutex_unlock(&shuffle_task_mutex);
- put_online_cpus();
+ cpus_read_unlock();
}
/* Shuffle tasks across CPUs, with the intent of allowing each CPU in the
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index d567b1717c4c..3ee23f4d437f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -219,6 +219,11 @@ config DYNAMIC_FTRACE_WITH_DIRECT_CALLS
depends on DYNAMIC_FTRACE_WITH_REGS
depends on HAVE_DYNAMIC_FTRACE_WITH_DIRECT_CALLS
+config DYNAMIC_FTRACE_WITH_ARGS
+ def_bool y
+ depends on DYNAMIC_FTRACE
+ depends on HAVE_DYNAMIC_FTRACE_WITH_ARGS
+
config FUNCTION_PROFILER
bool "Kernel function profiler"
depends on FUNCTION_TRACER
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b180f61e6d3..7efbc8aaf7f6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3100,6 +3100,7 @@ ops_references_rec(struct ftrace_ops *ops, struct dyn_ftrace *rec)
static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
{
+ bool init_nop = ftrace_need_init_nop();
struct ftrace_page *pg;
struct dyn_ftrace *p;
u64 start, stop;
@@ -3138,8 +3139,7 @@ static int ftrace_update_code(struct module *mod, struct ftrace_page *new_pgs)
* Do the initial record conversion from mcount jump
* to the NOP instructions.
*/
- if (!__is_defined(CC_USING_NOP_MCOUNT) &&
- !ftrace_nop_initialize(mod, p))
+ if (init_nop && !ftrace_nop_initialize(mod, p))
break;
update_cnt++;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 33899a71fdc1..a1adb29ef5c1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2897,14 +2897,26 @@ int tracepoint_printk_sysctl(struct ctl_table *table, int write,
void trace_event_buffer_commit(struct trace_event_buffer *fbuffer)
{
+ enum event_trigger_type tt = ETT_NONE;
+ struct trace_event_file *file = fbuffer->trace_file;
+
+ if (__event_trigger_test_discard(file, fbuffer->buffer, fbuffer->event,
+ fbuffer->entry, &tt))
+ goto discard;
+
if (static_key_false(&tracepoint_printk_key.key))
output_printk(fbuffer);
if (static_branch_unlikely(&trace_event_exports_enabled))
ftrace_exports(fbuffer->event, TRACE_EXPORT_EVENT);
- event_trigger_unlock_commit_regs(fbuffer->trace_file, fbuffer->buffer,
- fbuffer->event, fbuffer->entry,
- fbuffer->trace_ctx, fbuffer->regs);
+
+ trace_buffer_unlock_commit_regs(file->tr, fbuffer->buffer,
+ fbuffer->event, fbuffer->trace_ctx, fbuffer->regs);
+
+discard:
+ if (tt)
+ event_triggers_post_call(file, tt);
+
}
EXPORT_SYMBOL_GPL(trace_event_buffer_commit);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a180abf76d4e..4a0e693000c6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -1389,38 +1389,6 @@ event_trigger_unlock_commit(struct trace_event_file *file,
event_triggers_post_call(file, tt);
}
-/**
- * event_trigger_unlock_commit_regs - handle triggers and finish event commit
- * @file: The file pointer associated with the event
- * @buffer: The ring buffer that the event is being written to
- * @event: The event meta data in the ring buffer
- * @entry: The event itself
- * @trace_ctx: The tracing context flags.
- *
- * This is a helper function to handle triggers that require data
- * from the event itself. It also tests the event against filters and
- * if the event is soft disabled and should be discarded.
- *
- * Same as event_trigger_unlock_commit() but calls
- * trace_buffer_unlock_commit_regs() instead of trace_buffer_unlock_commit().
- */
-static inline void
-event_trigger_unlock_commit_regs(struct trace_event_file *file,
- struct trace_buffer *buffer,
- struct ring_buffer_event *event,
- void *entry, unsigned int trace_ctx,
- struct pt_regs *regs)
-{
- enum event_trigger_type tt = ETT_NONE;
-
- if (!__event_trigger_test_discard(file, buffer, event, entry, &tt))
- trace_buffer_unlock_commit_regs(file->tr, buffer, event,
- trace_ctx, regs);
-
- if (tt)
- event_triggers_post_call(file, tt);
-}
-
#define FILTER_PRED_INVALID ((unsigned short)-1)
#define FILTER_PRED_IS_RIGHT (1 << 15)
#define FILTER_PRED_FOLD (1 << 15)
diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c
index 949ef09dc537..a48aa2a2875b 100644
--- a/kernel/trace/trace_events_hist.c
+++ b/kernel/trace/trace_events_hist.c
@@ -3430,6 +3430,8 @@ trace_action_create_field_var(struct hist_trigger_data *hist_data,
event = data->match_data.event;
}
+ if (!event)
+ goto free;
/*
* At this point, we're looking at a field on another
* event. Because we can't modify a hist trigger on
diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c
index a7e3c24dee13..b61eefe5ccf5 100644
--- a/kernel/trace/trace_osnoise.c
+++ b/kernel/trace/trace_osnoise.c
@@ -253,6 +253,7 @@ static struct osnoise_data {
*/
static bool osnoise_busy;
+#ifdef CONFIG_PREEMPT_RT
/*
* Print the osnoise header info.
*/
@@ -261,6 +262,35 @@ static void print_osnoise_headers(struct seq_file *s)
if (osnoise_data.tainted)
seq_puts(s, "# osnoise is tainted!\n");
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+
+ seq_puts(s, "# |||||| / ");
+ seq_puts(s, " MAX\n");
+
+ seq_puts(s, "# ||||| / ");
+ seq_puts(s, " SINGLE Interference counters:\n");
+
+ seq_puts(s, "# ||||||| RUNTIME ");
+ seq_puts(s, " NOISE %% OF CPU NOISE +-----------------------------+\n");
+
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP IN US ");
+ seq_puts(s, " IN US AVAILABLE IN US HW NMI IRQ SIRQ THREAD\n");
+
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | | | | | | | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
+static void print_osnoise_headers(struct seq_file *s)
+{
+ if (osnoise_data.tainted)
+ seq_puts(s, "# osnoise is tainted!\n");
+
seq_puts(s, "# _-----=> irqs-off\n");
seq_puts(s, "# / _----=> need-resched\n");
seq_puts(s, "# | / _---=> hardirq/softirq\n");
@@ -279,6 +309,7 @@ static void print_osnoise_headers(struct seq_file *s)
seq_puts(s, "# | | | |||| | | ");
seq_puts(s, " | | | | | | | |\n");
}
+#endif /* CONFIG_PREEMPT_RT */
/*
* osnoise_taint - report an osnoise error.
@@ -323,6 +354,24 @@ static void trace_osnoise_sample(struct osnoise_sample *sample)
/*
* Print the timerlat header info.
*/
+#ifdef CONFIG_PREEMPT_RT
+static void print_timerlat_headers(struct seq_file *s)
+{
+ seq_puts(s, "# _-------=> irqs-off\n");
+ seq_puts(s, "# / _------=> need-resched\n");
+ seq_puts(s, "# | / _-----=> need-resched-lazy\n");
+ seq_puts(s, "# || / _----=> hardirq/softirq\n");
+ seq_puts(s, "# ||| / _---=> preempt-depth\n");
+ seq_puts(s, "# |||| / _--=> preempt-lazy-depth\n");
+ seq_puts(s, "# ||||| / _-=> migrate-disable\n");
+ seq_puts(s, "# |||||| /\n");
+ seq_puts(s, "# ||||||| ACTIVATION\n");
+ seq_puts(s, "# TASK-PID CPU# ||||||| TIMESTAMP ID ");
+ seq_puts(s, " CONTEXT LATENCY\n");
+ seq_puts(s, "# | | | ||||||| | | ");
+ seq_puts(s, " | |\n");
+}
+#else /* CONFIG_PREEMPT_RT */
static void print_timerlat_headers(struct seq_file *s)
{
seq_puts(s, "# _-----=> irqs-off\n");
@@ -336,6 +385,7 @@ static void print_timerlat_headers(struct seq_file *s)
seq_puts(s, "# | | | |||| | | ");
seq_puts(s, " | |\n");
}
+#endif /* CONFIG_PREEMPT_RT */
/*
* Record an timerlat_sample into the tracer buffer.
@@ -1025,9 +1075,13 @@ diff_osn_sample_stats(struct osnoise_variables *osn_var, struct osnoise_sample *
/*
* osnoise_stop_tracing - Stop tracing and the tracer.
*/
-static void osnoise_stop_tracing(void)
+static __always_inline void osnoise_stop_tracing(void)
{
struct trace_array *tr = osnoise_trace;
+
+ trace_array_printk_buf(tr->array_buffer.buffer, _THIS_IP_,
+ "stop tracing hit on cpu %d\n", smp_processor_id());
+
tracer_tracing_off(tr);
}