Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/build_policy.c   |   2
-rw-r--r--  kernel/sched/build_utility.c  |   1
-rw-r--r--  kernel/sched/clock.c          |   4
-rw-r--r--  kernel/sched/core.c           |  53
-rw-r--r--  kernel/sched/deadline.c       |  15
-rw-r--r--  kernel/sched/fair.c           | 302
-rw-r--r--  kernel/sched/idle.c           |   5
-rw-r--r--  kernel/sched/pelt.h           |   4
-rw-r--r--  kernel/sched/psi.c            |  18
-rw-r--r--  kernel/sched/rt.c             |   5
-rw-r--r--  kernel/sched/sched.h          |  61
-rw-r--r--  kernel/sched/smp.h            |   6
12 files changed, 178 insertions(+), 298 deletions(-)
diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c
index e0104b45029a..d9dc9ab3773f 100644
--- a/kernel/sched/build_policy.c
+++ b/kernel/sched/build_policy.c
@@ -15,6 +15,7 @@
/* Headers: */
#include <linux/sched/clock.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/hotplug.h>
#include <linux/sched/posix-timers.h>
#include <linux/sched/rt.h>
@@ -31,6 +32,7 @@
#include <uapi/linux/sched/types.h>
#include "sched.h"
+#include "smp.h"
#include "autogroup.h"
#include "stats.h"
diff --git a/kernel/sched/build_utility.c b/kernel/sched/build_utility.c
index eec0849b2aae..99bdd96f454f 100644
--- a/kernel/sched/build_utility.c
+++ b/kernel/sched/build_utility.c
@@ -14,6 +14,7 @@
#include <linux/sched/debug.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
+#include <linux/sched/nohz.h>
#include <linux/sched/mm.h>
#include <linux/sched/rseq_api.h>
#include <linux/sched/task_stack.h>
diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index d9272d9061a3..e374c0c923da 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -287,7 +287,7 @@ again:
clock = wrap_max(clock, min_clock);
clock = wrap_min(clock, max_clock);
- if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
+ if (!try_cmpxchg64(&scd->clock, &old_clock, clock))
goto again;
return clock;
@@ -349,7 +349,7 @@ again:
val = remote_clock;
}
- if (cmpxchg64(ptr, old_val, val) != old_val)
+ if (!try_cmpxchg64(ptr, &old_val, val))
goto again;
return val;
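
The cmpxchg64() -> try_cmpxchg64() conversions above follow a common pattern:
try_cmpxchg() returns a bool and, on failure, refreshes the expected value in
place, which typically generates better code than re-comparing the returned
old value. A minimal userspace sketch of the same retry loop using C11
atomics (all names here are illustrative, not from the patch):

#include <stdatomic.h>
#include <stdint.h>

static _Atomic uint64_t scd_clock;

static uint64_t publish_clock(uint64_t (*compute)(uint64_t old))
{
	uint64_t old = atomic_load(&scd_clock);
	uint64_t val = compute(old);

	/* On failure, 'old' is updated to the current value, mirroring
	 * try_cmpxchg64(); recompute and retry, much as the 'goto again'
	 * loops in sched/clock.c do. */
	while (!atomic_compare_exchange_weak(&scd_clock, &old, val))
		val = compute(old);

	return val;
}
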
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b8d2ff09b853..696c6490bd5b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -26,7 +26,10 @@
#include <linux/topology.h>
#include <linux/sched/clock.h>
#include <linux/sched/cond_resched.h>
+#include <linux/sched/cputime.h>
#include <linux/sched/debug.h>
+#include <linux/sched/hotplug.h>
+#include <linux/sched/init.h>
#include <linux/sched/isolation.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/mm.h>
@@ -597,10 +600,10 @@ void double_rq_lock(struct rq *rq1, struct rq *rq2)
swap(rq1, rq2);
raw_spin_rq_lock(rq1);
- if (__rq_lockp(rq1) == __rq_lockp(rq2))
- return;
+ if (__rq_lockp(rq1) != __rq_lockp(rq2))
+ raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
- raw_spin_rq_lock_nested(rq2, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(rq1, rq2);
}
#endif
@@ -2184,7 +2187,7 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
if (p->sched_class == rq->curr->sched_class)
rq->curr->sched_class->check_preempt_curr(rq, p, flags);
- else if (p->sched_class > rq->curr->sched_class)
+ else if (sched_class_above(p->sched_class, rq->curr->sched_class))
resched_curr(rq);
/*
@@ -2402,7 +2405,7 @@ static int migration_cpu_stop(void *data)
* __migrate_task() such that we will not miss enforcing cpus_ptr
* during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
raw_spin_lock(&p->pi_lock);
rq_lock(rq, &rf);
@@ -5729,7 +5732,7 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
* higher scheduling class, because otherwise those lose the
* opportunity to pull in more work from other CPUs.
*/
- if (likely(prev->sched_class <= &fair_sched_class &&
+ if (likely(!sched_class_above(prev->sched_class, &fair_sched_class) &&
rq->nr_running == rq->cfs.h_nr_running)) {
p = pick_next_task_fair(rq, prev, rf);
@@ -5792,6 +5795,8 @@ static inline struct task_struct *pick_task(struct rq *rq)
extern void task_vruntime_update(struct rq *rq, struct task_struct *p, bool in_fi);
+static void queue_core_balance(struct rq *rq);
+
static struct task_struct *
pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
@@ -5841,7 +5846,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
}
rq->core_pick = NULL;
- return next;
+ goto out;
}
put_prev_task_balance(rq, prev, rf);
@@ -5891,7 +5896,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
*/
WARN_ON_ONCE(fi_before);
task_vruntime_update(rq, next, false);
- goto done;
+ goto out_set_next;
}
}
@@ -6010,8 +6015,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
resched_curr(rq_i);
}
-done:
+out_set_next:
set_next_task(rq, next);
+out:
+ if (rq->core->core_forceidle_count && next == rq->idle)
+ queue_core_balance(rq);
+
return next;
}
@@ -6040,7 +6049,7 @@ static bool try_steal_cookie(int this, int that)
if (p == src->core_pick || p == src->curr)
goto next;
- if (!cpumask_test_cpu(this, &p->cpus_mask))
+ if (!is_cpu_allowed(p, this))
goto next;
if (p->core_occupation > dst->idle->core_occupation)
@@ -6106,7 +6115,7 @@ static void sched_core_balance(struct rq *rq)
static DEFINE_PER_CPU(struct callback_head, core_balance_head);
-void queue_core_balance(struct rq *rq)
+static void queue_core_balance(struct rq *rq)
{
if (!sched_core_enabled(rq))
return;
@@ -6416,7 +6425,7 @@ static void __sched notrace __schedule(unsigned int sched_mode)
migrate_disable_switch(rq, prev);
psi_sched_switch(prev, next, !task_on_rq_queued(prev));
- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev_state, prev, next);
+ trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state);
/* Also unlocks the rq: */
rq = context_switch(rq, prev, next, &rf);
@@ -8449,6 +8458,18 @@ static void __init preempt_dynamic_init(void)
}
}
+#define PREEMPT_MODEL_ACCESSOR(mode) \
+ bool preempt_model_##mode(void) \
+ { \
+ WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined); \
+ return preempt_dynamic_mode == preempt_dynamic_##mode; \
+ } \
+ EXPORT_SYMBOL_GPL(preempt_model_##mode)
+
+PREEMPT_MODEL_ACCESSOR(none);
+PREEMPT_MODEL_ACCESSOR(voluntary);
+PREEMPT_MODEL_ACCESSOR(full);
+
#else /* !CONFIG_PREEMPT_DYNAMIC */
static inline void preempt_dynamic_init(void) { }
@@ -9491,11 +9512,11 @@ void __init sched_init(void)
int i;
/* Make sure the linker didn't screw up */
- BUG_ON(&idle_sched_class + 1 != &fair_sched_class ||
- &fair_sched_class + 1 != &rt_sched_class ||
- &rt_sched_class + 1 != &dl_sched_class);
+ BUG_ON(&idle_sched_class != &fair_sched_class + 1 ||
+ &fair_sched_class != &rt_sched_class + 1 ||
+ &rt_sched_class != &dl_sched_class + 1);
#ifdef CONFIG_SMP
- BUG_ON(&dl_sched_class + 1 != &stop_sched_class);
+ BUG_ON(&dl_sched_class != &stop_sched_class + 1);
#endif
wait_bit_init();
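
For readers unfamiliar with the PREEMPT_MODEL_ACCESSOR() macro added above,
PREEMPT_MODEL_ACCESSOR(full) expands (modulo whitespace) to:

bool preempt_model_full(void)
{
	WARN_ON_ONCE(preempt_dynamic_mode == preempt_dynamic_undefined);
	return preempt_dynamic_mode == preempt_dynamic_full;
}
EXPORT_SYMBOL_GPL(preempt_model_full);

Note also that the reversed operand order in the sched_init() BUG_ON()s
matches the new reverse memory layout of the sched classes; see the sketch
after the sched.h hunks below.
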
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 82e10b74a7b2..b5152961b743 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1254,8 +1254,6 @@ int dl_runtime_exceeded(struct sched_dl_entity *dl_se)
return (dl_se->runtime <= 0);
}
-extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
-
/*
* This function implements the GRUB accounting rule:
* according to the GRUB reclaiming algorithm, the runtime is
@@ -1866,6 +1864,7 @@ out:
static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused)
{
+ struct rq_flags rf;
struct rq *rq;
if (READ_ONCE(p->__state) != TASK_WAKING)
@@ -1877,7 +1876,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
* from try_to_wake_up(). Hence, p->pi_lock is locked, but
* rq->lock is not... So, lock it
*/
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
if (p->dl.dl_non_contending) {
update_rq_clock(rq);
sub_running_bw(&p->dl, &rq->dl);
@@ -1893,7 +1892,7 @@ static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused
put_task_struct(p);
}
sub_rq_bw(&p->dl, &rq->dl);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
static void check_preempt_equal_dl(struct rq *rq, struct task_struct *p)
@@ -2353,13 +2352,7 @@ retry:
deactivate_task(rq, next_task, 0);
set_task_cpu(next_task, later_rq->cpu);
-
- /*
- * Update the later_rq clock here, because the clock is used
- * by the cpufreq_update_util() inside __add_running_bw().
- */
- update_rq_clock(later_rq);
- activate_task(later_rq, next_task, ENQUEUE_NOCLOCK);
+ activate_task(later_rq, next_task, 0);
ret = 1;
resched_curr(later_rq);

(The explicit update_rq_clock() on later_rq can be dropped here because
double_lock_balance() now clears RQCF_UPDATED via
double_rq_clock_clear_update(), added in sched.h below, so activate_task()
may update the clock itself without triggering the WARN_DOUBLE_CLOCK check.)
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 265bf7a75a37..8c5b74f66bd3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -36,6 +36,7 @@
#include <linux/sched/cond_resched.h>
#include <linux/sched/cputime.h>
#include <linux/sched/isolation.h>
+#include <linux/sched/nohz.h>
#include <linux/cpuidle.h>
#include <linux/interrupt.h>
@@ -343,19 +344,6 @@ const struct sched_class fair_sched_class;
#define for_each_sched_entity(se) \
for (; se; se = se->parent)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (!path)
- return;
-
- if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
- autogroup_path(cfs_rq->tg, path, len);
- else if (cfs_rq && cfs_rq->tg->css.cgroup)
- cgroup_path(cfs_rq->tg->css.cgroup, path, len);
- else
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
struct rq *rq = rq_of(cfs_rq);
@@ -523,12 +511,6 @@ static int se_is_idle(struct sched_entity *se)
#define for_each_sched_entity(se) \
for (; se; se = NULL)
-static inline void cfs_rq_tg_path(struct cfs_rq *cfs_rq, char *path, int len)
-{
- if (path)
- strlcpy(path, "(null)", len);
-}
-
static inline bool list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
return true;
@@ -3859,11 +3841,11 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
se->avg.runnable_sum = se->avg.runnable_avg * divider;
- se->avg.load_sum = divider;
- if (se_weight(se)) {
- se->avg.load_sum =
- div_u64(se->avg.load_avg * se->avg.load_sum, se_weight(se));
- }
+ se->avg.load_sum = se->avg.load_avg * divider;
+ if (se_weight(se) < se->avg.load_sum)
+ se->avg.load_sum = div_u64(se->avg.load_sum, se_weight(se));
+ else
+ se->avg.load_sum = 1;
enqueue_load_avg(cfs_rq, se);
cfs_rq->avg.util_avg += se->avg.util_avg;
@@ -4876,11 +4858,11 @@ static int tg_unthrottle_up(struct task_group *tg, void *data)
cfs_rq->throttle_count--;
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task_time += rq_clock_task(rq) -
- cfs_rq->throttled_clock_task;
+ cfs_rq->throttled_clock_pelt_time += rq_clock_pelt(rq) -
+ cfs_rq->throttled_clock_pelt;
/* Add cfs_rq with load or one or more already running entities to the list */
- if (!cfs_rq_is_decayed(cfs_rq) || cfs_rq->nr_running)
+ if (!cfs_rq_is_decayed(cfs_rq))
list_add_leaf_cfs_rq(cfs_rq);
}
@@ -4894,7 +4876,7 @@ static int tg_throttle_down(struct task_group *tg, void *data)
/* group is entering throttled state, stop time */
if (!cfs_rq->throttle_count) {
- cfs_rq->throttled_clock_task = rq_clock_task(rq);
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(rq);
list_del_leaf_cfs_rq(cfs_rq);
}
cfs_rq->throttle_count++;
@@ -5338,7 +5320,7 @@ static void sync_throttle(struct task_group *tg, int cpu)
pcfs_rq = tg->parent->cfs_rq[cpu];
cfs_rq->throttle_count = pcfs_rq->throttle_count;
- cfs_rq->throttled_clock_task = rq_clock_task(cpu_rq(cpu));
+ cfs_rq->throttled_clock_pelt = rq_clock_pelt(cpu_rq(cpu));
}
/* conditionally throttle active cfs_rq's from put_prev_entity() */
@@ -6574,108 +6556,19 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
}
/*
- * cpu_util_without: compute cpu utilization without any contributions from *p
- * @cpu: the CPU which utilization is requested
- * @p: the task which utilization should be discounted
- *
- * The utilization of a CPU is defined by the utilization of tasks currently
- * enqueued on that CPU as well as tasks which are currently sleeping after an
- * execution on that CPU.
- *
- * This method returns the utilization of the specified CPU by discounting the
- * utilization of the specified task, whenever the task is currently
- * contributing to the CPU utilization.
- */
-static unsigned long cpu_util_without(int cpu, struct task_struct *p)
-{
- struct cfs_rq *cfs_rq;
- unsigned int util;
-
- /* Task has no contribution or is new */
- if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
- return cpu_util_cfs(cpu);
-
- cfs_rq = &cpu_rq(cpu)->cfs;
- util = READ_ONCE(cfs_rq->avg.util_avg);
-
- /* Discount task's util from CPU's util */
- lsub_positive(&util, task_util(p));
-
- /*
- * Covered cases:
- *
- * a) if *p is the only task sleeping on this CPU, then:
- * cpu_util (== task_util) > util_est (== 0)
- * and thus we return:
- * cpu_util_without = (cpu_util - task_util) = 0
- *
- * b) if other tasks are SLEEPING on this CPU, which is now exiting
- * IDLE, then:
- * cpu_util >= task_util
- * cpu_util > util_est (== 0)
- * and thus we discount *p's blocked utilization to return:
- * cpu_util_without = (cpu_util - task_util) >= 0
- *
- * c) if other tasks are RUNNABLE on that CPU and
- * util_est > cpu_util
- * then we use util_est since it returns a more restrictive
- * estimation of the spare capacity on that CPU, by just
- * considering the expected utilization of tasks already
- * runnable on that CPU.
- *
- * Cases a) and b) are covered by the above code, while case c) is
- * covered by the following code when estimated utilization is
- * enabled.
- */
- if (sched_feat(UTIL_EST)) {
- unsigned int estimated =
- READ_ONCE(cfs_rq->avg.util_est.enqueued);
-
- /*
- * Despite the following checks we still have a small window
- * for a possible race, when an execl's select_task_rq_fair()
- * races with LB's detach_task():
- *
- * detach_task()
- * p->on_rq = TASK_ON_RQ_MIGRATING;
- * ---------------------------------- A
- * deactivate_task() \
- * dequeue_task() + RaceTime
- * util_est_dequeue() /
- * ---------------------------------- B
- *
- * The additional check on "current == p" it's required to
- * properly fix the execl regression and it helps in further
- * reducing the chances for the above race.
- */
- if (unlikely(task_on_rq_queued(p) || current == p))
- lsub_positive(&estimated, _task_util_est(p));
-
- util = max(util, estimated);
- }
-
- /*
- * Utilization (estimated) can exceed the CPU capacity, thus let's
- * clamp to the maximum CPU capacity to ensure consistency with
- * cpu_util.
- */
- return min_t(unsigned long, util, capacity_orig_of(cpu));
-}
-
-/*
- * Predicts what cpu_util(@cpu) would return if @p was migrated (and enqueued)
- * to @dst_cpu.
+ * Predicts what cpu_util(@cpu) would return if @p was removed from @cpu
+ * (@dst_cpu = -1) or migrated to @dst_cpu.
*/
static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
{
struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
- unsigned long util_est, util = READ_ONCE(cfs_rq->avg.util_avg);
+ unsigned long util = READ_ONCE(cfs_rq->avg.util_avg);
/*
- * If @p migrates from @cpu to another, remove its contribution. Or,
- * if @p migrates from another CPU to @cpu, add its contribution. In
- * the other cases, @cpu is not impacted by the migration, so the
- * util_avg should already be correct.
+ * If @dst_cpu is -1 or @p migrates from @cpu to @dst_cpu remove its
+ * contribution. If @p migrates from another CPU to @cpu add its
+ * contribution. In all the other cases @cpu is not impacted by the
+ * migration so its util_avg is already correct.
*/
if (task_cpu(p) == cpu && dst_cpu != cpu)
lsub_positive(&util, task_util(p));
@@ -6683,16 +6576,40 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
util += task_util(p);
if (sched_feat(UTIL_EST)) {
+ unsigned long util_est;
+
util_est = READ_ONCE(cfs_rq->avg.util_est.enqueued);
/*
- * During wake-up, the task isn't enqueued yet and doesn't
- * appear in the cfs_rq->avg.util_est.enqueued of any rq,
- * so just add it (if needed) to "simulate" what will be
- * cpu_util after the task has been enqueued.
+ * During wake-up @p isn't enqueued yet and doesn't contribute
+ * to any cpu_rq(cpu)->cfs.avg.util_est.enqueued.
+ * If @dst_cpu == @cpu add it to "simulate" cpu_util after @p
+ * has been enqueued.
+ *
+ * During exec (@dst_cpu = -1) @p is enqueued and does
+ * contribute to cpu_rq(cpu)->cfs.util_est.enqueued.
+ * Remove it to "simulate" cpu_util without @p's contribution.
+ *
+ * Despite the task_on_rq_queued(@p) check there is still a
+ * small window for a possible race when an exec
+ * select_task_rq_fair() races with LB's detach_task().
+ *
+ * detach_task()
+ * deactivate_task()
+ * p->on_rq = TASK_ON_RQ_MIGRATING;
+ * -------------------------------- A
+ * dequeue_task() \
+ * dequeue_task_fair() + Race Time
+ * util_est_dequeue() /
+ * -------------------------------- B
+ *
+ * The additional check "current == p" is required to further
+ * reduce the race window.
*/
if (dst_cpu == cpu)
util_est += _task_util_est(p);
+ else if (unlikely(task_on_rq_queued(p) || current == p))
+ lsub_positive(&util_est, _task_util_est(p));
util = max(util, util_est);
}
@@ -6701,6 +6618,28 @@ static unsigned long cpu_util_next(int cpu, struct task_struct *p, int dst_cpu)
}
/*
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
+ */
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
+{
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_util_cfs(cpu);
+
+ return cpu_util_next(cpu, p, -1);
+}
+
+/*
* compute_energy(): Estimates the energy that @pd would consume if @p was
* migrated to @dst_cpu. compute_energy() predicts what will be the utilization
* landscape of @pd's CPUs after the task migration, and uses the Energy Model
@@ -9490,8 +9429,6 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
local->avg_load = (local->group_load * SCHED_CAPACITY_SCALE) /
local->group_capacity;
- sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
- sds->total_capacity;
/*
* If the local group is more loaded than the selected
* busiest group don't try to pull any tasks.
@@ -9500,6 +9437,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
env->imbalance = 0;
return;
}
+
+ sds->avg_load = (sds->total_load * SCHED_CAPACITY_SCALE) /
+ sds->total_capacity;
}
/*
@@ -9525,7 +9465,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* busiest \ local  has_spare  fully_busy  misfit  asym  imbalanced  overloaded
* has_spare        nr_idle    balanced    N/A     N/A   balanced    balanced
* fully_busy       nr_idle    nr_idle     N/A     N/A   balanced    balanced
- * misfit_task      force      N/A         N/A     N/A   force       force
+ * misfit_task      force      N/A         N/A     N/A   N/A         N/A
* asym_packing     force      force       N/A     N/A   force       force
* imbalanced       force      force       N/A     N/A   force       force
* overloaded       force      force       N/A     N/A   force       avg_load
@@ -11911,101 +11851,3 @@ __init void init_sched_fair_class(void)
#endif /* SMP */
}
-
-/*
- * Helper functions to facilitate extracting info from tracepoints.
- */
-
-const struct sched_avg *sched_trace_cfs_rq_avg(struct cfs_rq *cfs_rq)
-{
-#ifdef CONFIG_SMP
- return cfs_rq ? &cfs_rq->avg : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_avg);
-
-char *sched_trace_cfs_rq_path(struct cfs_rq *cfs_rq, char *str, int len)
-{
- if (!cfs_rq) {
- if (str)
- strlcpy(str, "(null)", len);
- else
- return NULL;
- }
-
- cfs_rq_tg_path(cfs_rq, str, len);
- return str;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_path);
-
-int sched_trace_cfs_rq_cpu(struct cfs_rq *cfs_rq)
-{
- return cfs_rq ? cpu_of(rq_of(cfs_rq)) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_cfs_rq_cpu);
-
-const struct sched_avg *sched_trace_rq_avg_rt(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_rt : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_rt);
-
-const struct sched_avg *sched_trace_rq_avg_dl(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq ? &rq->avg_dl : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_dl);
-
-const struct sched_avg *sched_trace_rq_avg_irq(struct rq *rq)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_HAVE_SCHED_AVG_IRQ)
- return rq ? &rq->avg_irq : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_avg_irq);
-
-int sched_trace_rq_cpu(struct rq *rq)
-{
- return rq ? cpu_of(rq) : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu);
-
-int sched_trace_rq_cpu_capacity(struct rq *rq)
-{
- return rq ?
-#ifdef CONFIG_SMP
- rq->cpu_capacity
-#else
- SCHED_CAPACITY_SCALE
-#endif
- : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_cpu_capacity);
-
-const struct cpumask *sched_trace_rd_span(struct root_domain *rd)
-{
-#ifdef CONFIG_SMP
- return rd ? rd->span : NULL;
-#else
- return NULL;
-#endif
-}
-EXPORT_SYMBOL_GPL(sched_trace_rd_span);
-
-int sched_trace_rq_nr_running(struct rq *rq)
-{
- return rq ? rq->nr_running : -1;
-}
-EXPORT_SYMBOL_GPL(sched_trace_rq_nr_running);
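
A small self-contained model of the dst_cpu convention the reworked
cpu_util_next() relies on, where dst_cpu == -1 means "pretend @p was removed
from @cpu" (util_est handling omitted; all names and numbers are
illustrative):

#include <stdio.h>

static unsigned long util_next(unsigned long cpu_util, unsigned long task_util,
			       int p_on_cpu, int cpu, int dst_cpu)
{
	unsigned long util = cpu_util;

	if (p_on_cpu && dst_cpu != cpu)		/* leaving, or removed (-1) */
		util -= task_util < util ? task_util : util;
	else if (!p_on_cpu && dst_cpu == cpu)	/* arriving */
		util += task_util;

	return util;
}

int main(void)
{
	/* p (util 128) runs on CPU1 (util 512); predict CPU1 without p: */
	printf("%lu\n", util_next(512, 128, 1, 1, -1));	/* -> 384 */
	return 0;
}

This is exactly how the re-added cpu_util_without() delegates: it simply
calls cpu_util_next(cpu, p, -1).
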
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 8f8b5020e76a..328cccbee444 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -102,7 +102,7 @@ void __cpuidle default_idle_call(void)
* last -- this is very similar to the entry code.
*/
trace_hardirqs_on_prepare();
- lockdep_hardirqs_on_prepare(_THIS_IP_);
+ lockdep_hardirqs_on_prepare();
rcu_idle_enter();
lockdep_hardirqs_on(_THIS_IP_);
@@ -327,7 +327,7 @@ static void do_idle(void)
* RCU relies on this call to be done outside of an RCU read-side
* critical section.
*/
- flush_smp_call_function_from_idle();
+ flush_smp_call_function_queue();
schedule_idle();
if (unlikely(klp_patch_pending(current)))
@@ -434,7 +434,6 @@ static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool fir
{
update_idle_core(rq);
schedstat_inc(rq->sched_goidle);
- queue_core_balance(rq);
}
#ifdef CONFIG_SMP
diff --git a/kernel/sched/pelt.h b/kernel/sched/pelt.h
index c336f5f481bc..4ff2ed4f8fa1 100644
--- a/kernel/sched/pelt.h
+++ b/kernel/sched/pelt.h
@@ -145,9 +145,9 @@ static inline u64 rq_clock_pelt(struct rq *rq)
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
{
if (unlikely(cfs_rq->throttle_count))
- return cfs_rq->throttled_clock_task - cfs_rq->throttled_clock_task_time;
+ return cfs_rq->throttled_clock_pelt - cfs_rq->throttled_clock_pelt_time;
- return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_task_time;
+ return rq_clock_pelt(rq_of(cfs_rq)) - cfs_rq->throttled_clock_pelt_time;
}
#else
static inline u64 cfs_rq_clock_pelt(struct cfs_rq *cfs_rq)
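
The rename makes the intent explicit: these timestamps snapshot the PELT
clock, not the task clock. While throttled, cfs_rq_clock_pelt() returns a
frozen value; a worked numeric example under invented values:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t rq_pelt = 1000;	/* rq_clock_pelt(rq) */
	uint64_t thr_pelt = 800;	/* throttled_clock_pelt */
	uint64_t thr_time = 50;		/* throttled_clock_pelt_time */

	/* Throttled: clock stays frozen at the throttle point. */
	printf("%llu\n", (unsigned long long)(thr_pelt - thr_time));	/* 750 */

	/* Running: rq PELT clock minus total time spent throttled. */
	printf("%llu\n", (unsigned long long)(rq_pelt - thr_time));	/* 950 */
	return 0;
}
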
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index a4fa3aadfcba..a337f3e35997 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -1060,14 +1060,17 @@ int psi_show(struct seq_file *m, struct psi_group *group, enum psi_res res)
mutex_unlock(&group->avgs_lock);
for (full = 0; full < 2; full++) {
- unsigned long avg[3];
- u64 total;
+ unsigned long avg[3] = { 0, };
+ u64 total = 0;
int w;
- for (w = 0; w < 3; w++)
- avg[w] = group->avg[res * 2 + full][w];
- total = div_u64(group->total[PSI_AVGS][res * 2 + full],
- NSEC_PER_USEC);
+ /* CPU FULL is undefined at the system level */
+ if (!(group == &psi_system && res == PSI_CPU && full)) {
+ for (w = 0; w < 3; w++)
+ avg[w] = group->avg[res * 2 + full][w];
+ total = div_u64(group->total[PSI_AVGS][res * 2 + full],
+ NSEC_PER_USEC);
+ }
seq_printf(m, "%s avg10=%lu.%02lu avg60=%lu.%02lu avg300=%lu.%02lu total=%llu\n",
full ? "full" : "some",
@@ -1117,7 +1120,8 @@ struct psi_trigger *psi_trigger_create(struct psi_group *group,
t->state = state;
t->threshold = threshold_us * NSEC_PER_USEC;
t->win.size = window_us * NSEC_PER_USEC;
- window_reset(&t->win, 0, 0, 0);
+ window_reset(&t->win, sched_clock(),
+ group->total[PSI_POLL][t->state], 0);
t->event = 0;
t->last_event_time = 0;
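
Priming the trigger window with the current clock and the group's current
total, rather than zeros, keeps stall time that accrued before the trigger
existed from being counted as fresh growth. A toy model of the failure mode
(window_reset()/window_growth() here are simplified stand-ins for the psi
helpers; numbers are invented):

#include <stdio.h>
#include <stdint.h>

struct win { uint64_t start_time, start_value; };

static void window_reset(struct win *w, uint64_t now, uint64_t value)
{
	w->start_time = now;
	w->start_value = value;
}

static uint64_t window_growth(const struct win *w, uint64_t total)
{
	return total - w->start_value;	/* stall accrued inside the window */
}

int main(void)
{
	struct win w;
	uint64_t total = 5000;	/* stall accrued before trigger creation */

	window_reset(&w, 0, 0);		/* old behaviour */
	printf("%llu\n", (unsigned long long)window_growth(&w, total + 40));
	/* -> 5040: old stall counted as growth, spurious initial event */

	window_reset(&w, 1000, total);	/* patched behaviour */
	printf("%llu\n", (unsigned long long)window_growth(&w, total + 40));
	/* -> 40: only stall accrued after creation counts */
	return 0;
}
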
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index b491a0f8c25d..8c9ed9664840 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -921,6 +921,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
int enqueue = 0;
struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
struct rq *rq = rq_of_rt_rq(rt_rq);
+ struct rq_flags rf;
int skip;
/*
@@ -935,7 +936,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (skip)
continue;
- raw_spin_rq_lock(rq);
+ rq_lock(rq, &rf);
update_rq_clock(rq);
if (rt_rq->rt_time) {
@@ -973,7 +974,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
if (enqueue)
sched_rt_rq_enqueue(rt_rq);
- raw_spin_rq_unlock(rq);
+ rq_unlock(rq, &rf);
}
if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index ae0f6e5a76f9..01259611beb9 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -610,8 +610,8 @@ struct cfs_rq {
s64 runtime_remaining;
u64 throttled_clock;
- u64 throttled_clock_task;
- u64 throttled_clock_task_time;
+ u64 throttled_clock_pelt;
+ u64 throttled_clock_pelt_time;
int throttled;
int throttle_count;
struct list_head throttled_list;
@@ -1239,8 +1239,6 @@ static inline bool sched_group_cookie_match(struct rq *rq,
return false;
}
-extern void queue_core_balance(struct rq *rq);
-
static inline bool sched_core_enqueued(struct task_struct *p)
{
return !RB_EMPTY_NODE(&p->core_node);
@@ -1274,10 +1272,6 @@ static inline raw_spinlock_t *__rq_lockp(struct rq *rq)
return &rq->__lock;
}
-static inline void queue_core_balance(struct rq *rq)
-{
-}
-
static inline bool sched_cpu_cookie_match(struct rq *rq, struct task_struct *p)
{
return true;
@@ -1840,12 +1834,7 @@ static inline void dirty_sched_domain_sysctl(int cpu)
#endif
extern int sched_update_scaling(void);
-
-extern void flush_smp_call_function_from_idle(void);
-
-#else /* !CONFIG_SMP: */
-static inline void flush_smp_call_function_from_idle(void) { }
-#endif
+#endif /* CONFIG_SMP */
#include "stats.h"
@@ -2195,6 +2184,8 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
*
* include/asm-generic/vmlinux.lds.h
*
+ * *CAREFUL* they are laid out in *REVERSE* order!!!
+ *
* Also enforce alignment on the instance, not the type, to guarantee layout.
*/
#define DEFINE_SCHED_CLASS(name) \
@@ -2203,17 +2194,16 @@ const struct sched_class name##_sched_class \
__section("__" #name "_sched_class")
/* Defined in include/asm-generic/vmlinux.lds.h */
-extern struct sched_class __begin_sched_classes[];
-extern struct sched_class __end_sched_classes[];
-
-#define sched_class_highest (__end_sched_classes - 1)
-#define sched_class_lowest (__begin_sched_classes - 1)
+extern struct sched_class __sched_class_highest[];
+extern struct sched_class __sched_class_lowest[];
#define for_class_range(class, _from, _to) \
- for (class = (_from); class != (_to); class--)
+ for (class = (_from); class < (_to); class++)
#define for_each_class(class) \
- for_class_range(class, sched_class_highest, sched_class_lowest)
+ for_class_range(class, __sched_class_highest, __sched_class_lowest)
+
+#define sched_class_above(_a, _b) ((_a) < (_b))
extern const struct sched_class stop_sched_class;
extern const struct sched_class dl_sched_class;
@@ -2322,6 +2312,7 @@ extern void resched_cpu(int cpu);
extern struct rt_bandwidth def_rt_bandwidth;
extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime);
+extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
extern void init_dl_bandwidth(struct dl_bandwidth *dl_b, u64 period, u64 runtime);
extern void init_dl_task_timer(struct sched_dl_entity *dl_se);
@@ -2491,6 +2482,24 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_DEBUG
+/*
+ * In double_lock_balance()/double_rq_lock(), we use raw_spin_rq_lock() to
+ * acquire rq lock instead of rq_lock(). So at the end of these two functions
+ * we need to call double_rq_clock_clear_update() to clear RQCF_UPDATED of
+ * rq->clock_update_flags to avoid the WARN_DOUBLE_CLOCK warning.
+ */
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2)
+{
+ rq1->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+ /* rq1 == rq2 for !CONFIG_SMP, so just clear RQCF_UPDATED once. */
+#ifdef CONFIG_SMP
+ rq2->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP);
+#endif
+}
+#else
+static inline void double_rq_clock_clear_update(struct rq *rq1, struct rq *rq2) {}
+#endif
#ifdef CONFIG_SMP
@@ -2556,14 +2565,15 @@ static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
__acquires(busiest->lock)
__acquires(this_rq->lock)
{
- if (__rq_lockp(this_rq) == __rq_lockp(busiest))
- return 0;
-
- if (likely(raw_spin_rq_trylock(busiest)))
+ if (__rq_lockp(this_rq) == __rq_lockp(busiest) ||
+ likely(raw_spin_rq_trylock(busiest))) {
+ double_rq_clock_clear_update(this_rq, busiest);
return 0;
+ }
if (rq_order_less(this_rq, busiest)) {
raw_spin_rq_lock_nested(busiest, SINGLE_DEPTH_NESTING);
+ double_rq_clock_clear_update(this_rq, busiest);
return 0;
}
@@ -2657,6 +2667,7 @@ static inline void double_rq_lock(struct rq *rq1, struct rq *rq2)
BUG_ON(rq1 != rq2);
raw_spin_rq_lock(rq1);
__acquire(rq2->lock); /* Fake it out ;) */
+ double_rq_clock_clear_update(rq1, rq2);
}
/*
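
With the classes now emitted highest-to-lowest, a class's priority is just
its address: sched_class_above(a, b) is a plain pointer comparison and
for_each_class() walks upward through memory. A compilable model of the
reversed layout (the class names match the kernel's; everything else is
illustrative):

#include <stdio.h>

struct sched_class { const char *name; };

/* Laid out highest -> lowest, mirroring the linker-script ordering. */
static const struct sched_class classes[] = {
	{ "stop" }, { "dl" }, { "rt" }, { "fair" }, { "idle" },
};
#define __sched_class_highest	(&classes[0])
#define __sched_class_lowest	(&classes[5])	/* one past the end */
#define sched_class_above(a, b)	((a) < (b))

int main(void)
{
	const struct sched_class *class;

	for (class = __sched_class_highest; class < __sched_class_lowest; class++)
		printf("%s\n", class->name);	/* stop dl rt fair idle */

	/* dl sits above rt: lower address wins. */
	printf("%d\n", sched_class_above(&classes[1], &classes[2]));	/* 1 */
	return 0;
}
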
diff --git a/kernel/sched/smp.h b/kernel/sched/smp.h
index 9620e323162c..2eb23dd0f285 100644
--- a/kernel/sched/smp.h
+++ b/kernel/sched/smp.h
@@ -7,3 +7,9 @@
extern void sched_ttwu_pending(void *arg);
extern void send_call_function_single_ipi(int cpu);
+
+#ifdef CONFIG_SMP
+extern void flush_smp_call_function_queue(void);
+#else
+static inline void flush_smp_call_function_queue(void) { }
+#endif