summary | refs | log | tree | commit | diff
path: root/kernel/sched
diff options
context:
space:
mode:
author Ingo Molnar <mingo@kernel.org> 2022-06-25 09:40:52 +0200
committer Ingo Molnar <mingo@kernel.org> 2022-06-25 09:40:52 +0200
commit f1d86ac09d85db494961b6388d64c4b35ea02e5c (patch)
tree 386c8be27046f0b3f2c4da9d409a152258ca3bdd /kernel/sched
parent 7777248f857a29211f541f0bf5c8fe4e4e6d8769 (diff)
parent f3dd3f674555bd9455c5ae7fafce0696bd9931b3 (diff)
download linux-next-f1d86ac09d85db494961b6388d64c4b35ea02e5c.tar.gz
Merge branch into tip/master: 'sched/core'
* sched/core:
  f3dd3f674555 sched: Remove the limitation of WF_ON_CPU on wakelist if wakee cpu is idle
  28156108fecb sched: Fix the check of nr_running at queue wakelist
  792b9f65a568 sched: Allow newidle balancing to bail out of load_balance
  2ed81e765417 sched/deadline: Use proc_douintvec_minmax() limit minimum value
  51bf903b64bd sched/fair: Optimize and simplify rq leaf_cfs_rq_list
  f5b2eeb49991 sched/fair: Consider CPU affinity when allowing NUMA imbalance in find_idlest_group()
  026b98a93bbd sched/numa: Adjust imb_numa_nr to a better approximation of memory channels
  cb29a5c19d2d sched/numa: Apply imbalance limitations consistently
  13ede3315087 sched/numa: Do not swap tasks between nodes when spare capacity is available
  70ce3ea9aa4e sched/numa: Initialise numa_migrate_retry

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- kernel/sched/core.c 30
-rw-r--r-- kernel/sched/deadline.c 6
-rw-r--r-- kernel/sched/fair.c 205
-rw-r--r-- kernel/sched/sched.h 1
-rw-r--r-- kernel/sched/topology.c 23
5 files changed, 139 insertions, 126 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index da0bf6fe9ecd..daadedc78fd9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3808,7 +3808,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu)
return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
}
-static inline bool ttwu_queue_cond(int cpu, int wake_flags)
+static inline bool ttwu_queue_cond(int cpu)
{
/*
* Do not complicate things with the async wake_list while the CPU is
@@ -3824,13 +3824,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
if (!cpus_share_cache(smp_processor_id(), cpu))
return true;
+ if (cpu == smp_processor_id())
+ return false;
+
/*
- * If the task is descheduling and the only running task on the
- * CPU then use the wakelist to offload the task activation to
- * the soon-to-be-idle CPU as the current CPU is likely busy.
- * nr_running is checked to avoid unnecessary task stacking.
+ * If the wakee cpu is idle, or the task is descheduling and the
+ * only running task on the CPU, then use the wakelist to offload
+ * the task activation to the idle (or soon-to-be-idle) CPU as
+ * the current CPU is likely busy. nr_running is checked to
+ * avoid unnecessary task stacking.
+ *
+ * Note that we can only get here with (wakee) p->on_rq=0,
+ * p->on_cpu can be whatever, we've done the dequeue, so
+ * the wakee has been accounted out of ->nr_running.
*/
- if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1)
+ if (!cpu_rq(cpu)->nr_running)
return true;
return false;
@@ -3838,10 +3846,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags)
static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags)
{
- if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) {
- if (WARN_ON_ONCE(cpu == smp_processor_id()))
- return false;
-
+ if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) {
sched_clock_cpu(cpu); /* Sync clocks across CPUs */
__ttwu_queue_wakelist(p, cpu, wake_flags);
return true;
@@ -4163,7 +4168,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
* scheduling.
*/
if (smp_load_acquire(&p->on_cpu) &&
- ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU))
+ ttwu_queue_wakelist(p, task_cpu(p), wake_flags))
goto unlock;
/*
@@ -4753,7 +4758,8 @@ static inline void prepare_task(struct task_struct *next)
* Claim the task as running, we do this before switching to it
* such that any running task will have this set.
*
- * See the ttwu() WF_ON_CPU case and its ordering comment.
+ * See the smp_load_acquire(&p->on_cpu) case in ttwu() and
+ * its ordering comment.
*/
WRITE_ONCE(next->on_cpu, 1);
#endif
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index b5152961b743..5867e186c39a 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -30,14 +30,16 @@ static struct ctl_table sched_dl_sysctls[] = {
.data = &sysctl_sched_dl_period_max,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra1 = (void *)&sysctl_sched_dl_period_min,
},
{
.procname = "sched_deadline_period_min_us",
.data = &sysctl_sched_dl_period_min,
.maxlen = sizeof(unsigned int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = proc_douintvec_minmax,
+ .extra2 = (void *)&sysctl_sched_dl_period_max,
},
{}
};
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 77b2048a9326..8bed75757e65 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1055,6 +1055,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
* Scheduling class queueing methods:
*/
+#ifdef CONFIG_NUMA
+#define NUMA_IMBALANCE_MIN 2
+
+static inline long
+adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr)
+{
+ /*
+ * Allow a NUMA imbalance if the number of busy CPUs does not exceed
+ * the maximum threshold. Above this threshold, individual tasks may
+ * be contending for both memory bandwidth and any shared HT resources.
+ * This is an approximation as the number of running tasks may not be
+ * related to the number of busy CPUs due to sched_setaffinity.
+ */
+ if (dst_running > imb_numa_nr)
+ return imbalance;
+
+ /*
+ * Allow a small imbalance based on a simple pair of communicating
+ * tasks that remain local when the destination is lightly loaded.
+ */
+ if (imbalance <= NUMA_IMBALANCE_MIN)
+ return 0;
+
+ return imbalance;
+}
+#endif /* CONFIG_NUMA */
+
#ifdef CONFIG_NUMA_BALANCING
/*
* Approximate time to scan a full NUMA task in ms. The task scan period is
@@ -1548,8 +1575,6 @@ struct task_numa_env {
static unsigned long cpu_load(struct rq *rq);
static unsigned long cpu_runnable(struct rq *rq);
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr);
static inline enum
numa_type numa_classify(unsigned int imbalance_pct,
@@ -1790,6 +1815,15 @@ static bool task_numa_compare(struct task_numa_env *env,
*/
cur_ng = rcu_dereference(cur->numa_group);
if (cur_ng == p_ng) {
+ /*
+ * Do not swap within a group or between tasks that have
+ * no group if there is spare capacity. Swapping does
+ * not address the load imbalance and helps one task at
+ * the cost of punishing another.
+ */
+ if (env->dst_stats.node_type == node_has_spare)
+ goto unlock;
+
imp = taskimp + task_weight(cur, env->src_nid, dist) -
task_weight(cur, env->dst_nid, dist);
/*
@@ -2885,6 +2919,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p)
p->node_stamp = 0;
p->numa_scan_seq = mm ? mm->numa_scan_seq : 0;
p->numa_scan_period = sysctl_numa_balancing_scan_delay;
+ p->numa_migrate_retry = 0;
/* Protect against double add, see task_tick_numa and task_numa_work */
p->numa_work.next = &p->numa_work;
p->numa_faults = NULL;
@@ -3144,6 +3179,8 @@ void reweight_task(struct task_struct *p, int prio)
load->inv_weight = sched_prio_to_wmult[prio];
}
+static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
+
#ifdef CONFIG_FAIR_GROUP_SCHED
#ifdef CONFIG_SMP
/*
@@ -3254,8 +3291,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq)
}
#endif /* CONFIG_SMP */
-static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
-
/*
* Recomputes the group entity based on the current state of its group
* runqueue.
@@ -4368,16 +4403,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
__enqueue_entity(cfs_rq, se);
se->on_rq = 1;
- /*
- * When bandwidth control is enabled, cfs might have been removed
- * because of a parent been throttled but cfs->nr_running > 1. Try to
- * add it unconditionally.
- */
- if (cfs_rq->nr_running == 1 || cfs_bandwidth_used())
- list_add_leaf_cfs_rq(cfs_rq);
-
- if (cfs_rq->nr_running == 1)
+ if (cfs_rq->nr_running == 1) {
check_enqueue_throttle(cfs_rq);
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
+ }
}
static void __clear_buddies_last(struct sched_entity *se)
@@ -4992,11 +5022,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* update hierarchical throttle state */
walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
- /* Nothing to run but something to decay (on_list)? Complete the branch */
if (!cfs_rq->load.weight) {
- if (cfs_rq->on_list)
- goto unthrottle_throttle;
- return;
+ if (!cfs_rq->on_list)
+ return;
+ /*
+ * Nothing to run but something to decay (on_list)?
+ * Complete the branch.
+ */
+ for_each_sched_entity(se) {
+ if (list_add_leaf_cfs_rq(cfs_rq_of(se)))
+ break;
+ }
+ goto unthrottle_throttle;
}
task_delta = cfs_rq->h_nr_running;
@@ -5034,31 +5071,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(qcfs_rq))
goto unthrottle_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(qcfs_rq))
- list_add_leaf_cfs_rq(qcfs_rq);
}
/* At this point se is NULL and we are at root level*/
add_nr_running(rq, task_delta);
unthrottle_throttle:
- /*
- * The cfs_rq_throttled() breaks in the above iteration can result in
- * incomplete leaf list maintenance, resulting in triggering the
- * assertion below.
- */
- for_each_sched_entity(se) {
- struct cfs_rq *qcfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(qcfs_rq))
- break;
- }
-
assert_list_leaf_cfs_rq(rq);
/* Determine whether we need to wake up potentially idle CPU: */
@@ -5713,13 +5731,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
/* end evaluation on encountering a throttled cfs_rq */
if (cfs_rq_throttled(cfs_rq))
goto enqueue_throttle;
-
- /*
- * One parent has been throttled and cfs_rq removed from the
- * list. Add it back to not break the leaf list.
- */
- if (throttled_hierarchy(cfs_rq))
- list_add_leaf_cfs_rq(cfs_rq);
}
/* At this point se is NULL and we are at root level*/
@@ -5743,21 +5754,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
update_overutilized_status(rq);
enqueue_throttle:
- if (cfs_bandwidth_used()) {
- /*
- * When bandwidth control is enabled; the cfs_rq_throttled()
- * breaks in the above iteration can result in incomplete
- * leaf list maintenance, resulting in triggering the assertion
- * below.
- */
- for_each_sched_entity(se) {
- cfs_rq = cfs_rq_of(se);
-
- if (list_add_leaf_cfs_rq(cfs_rq))
- break;
- }
- }
-
assert_list_leaf_cfs_rq(rq);
hrtick_update(rq);
@@ -9058,16 +9054,6 @@ static bool update_pick_idlest(struct sched_group *idlest,
}
/*
- * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain.
- * This is an approximation as the number of running tasks may not be
- * related to the number of busy CPUs due to sched_setaffinity.
- */
-static inline bool allow_numa_imbalance(int running, int imb_numa_nr)
-{
- return running <= imb_numa_nr;
-}
-
-/*
* find_idlest_group() finds and returns the least busy CPU group within the
* domain.
*
@@ -9183,7 +9169,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
break;
case group_has_spare:
+#ifdef CONFIG_NUMA
if (sd->flags & SD_NUMA) {
+ int imb_numa_nr = sd->imb_numa_nr;
#ifdef CONFIG_NUMA_BALANCING
int idlest_cpu;
/*
@@ -9196,17 +9184,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu)
idlest_cpu = cpumask_first(sched_group_span(idlest));
if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid)
return idlest;
-#endif
+#endif /* CONFIG_NUMA_BALANCING */
/*
* Otherwise, keep the task close to the wakeup source
* and improve locality if the number of running tasks
* would remain below threshold where an imbalance is
- * allowed. If there is a real need of migration,
- * periodic load balance will take care of it.
+ * allowed while accounting for the possibility the
+ * task is pinned to a subset of CPUs. If there is a
+ * real need of migration, periodic load balance will
+ * take care of it.
*/
- if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr))
+ if (p->nr_cpus_allowed != NR_CPUS) {
+ struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask);
+
+ cpumask_and(cpus, sched_group_span(local), p->cpus_ptr);
+ imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr);
+ }
+
+ imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus);
+ if (!adjust_numa_imbalance(imbalance,
+ local_sgs.sum_nr_running + 1,
+ imb_numa_nr)) {
return NULL;
+ }
}
+#endif /* CONFIG_NUMA */
/*
* Select group with highest number of idle CPUs. We could also
@@ -9293,24 +9295,6 @@ next_group:
}
}
-#define NUMA_IMBALANCE_MIN 2
-
-static inline long adjust_numa_imbalance(int imbalance,
- int dst_running, int imb_numa_nr)
-{
- if (!allow_numa_imbalance(dst_running, imb_numa_nr))
- return imbalance;
-
- /*
- * Allow a small imbalance based on a simple pair of communicating
- * tasks that remain local when the destination is lightly loaded.
- */
- if (imbalance <= NUMA_IMBALANCE_MIN)
- return 0;
-
- return imbalance;
-}
-
/**
* calculate_imbalance - Calculate the amount of imbalance present within the
* groups of a given sched_domain during load balance.
@@ -9395,7 +9379,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
*/
env->migration_type = migrate_task;
lsub_positive(&nr_diff, local->sum_nr_running);
- env->imbalance = nr_diff >> 1;
+ env->imbalance = nr_diff;
} else {
/*
@@ -9403,15 +9387,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
* idle cpus.
*/
env->migration_type = migrate_task;
- env->imbalance = max_t(long, 0, (local->idle_cpus -
- busiest->idle_cpus) >> 1);
+ env->imbalance = max_t(long, 0,
+ (local->idle_cpus - busiest->idle_cpus));
}
+#ifdef CONFIG_NUMA
/* Consider allowing a small imbalance between NUMA groups */
if (env->sd->flags & SD_NUMA) {
env->imbalance = adjust_numa_imbalance(env->imbalance,
- local->sum_nr_running + 1, env->sd->imb_numa_nr);
+ local->sum_nr_running + 1,
+ env->sd->imb_numa_nr);
}
+#endif
+
+ /* Number of tasks to move to restore balance */
+ env->imbalance >>= 1;
return;
}
@@ -9834,9 +9824,15 @@ static int should_we_balance(struct lb_env *env)
/*
* In the newly idle case, we will allow all the CPUs
* to do the newly idle load balance.
+ *
+ * However, we bail out if we already have tasks or a wakeup pending,
+ * to optimize wakeup latency.
*/
- if (env->idle == CPU_NEWLY_IDLE)
+ if (env->idle == CPU_NEWLY_IDLE) {
+ if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending)
+ return 0;
return 1;
+ }
/* Try to find first idle CPU */
for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) {
@@ -11287,9 +11283,13 @@ static inline bool vruntime_normalized(struct task_struct *p)
*/
static void propagate_entity_cfs_rq(struct sched_entity *se)
{
- struct cfs_rq *cfs_rq;
+ struct cfs_rq *cfs_rq = cfs_rq_of(se);
+
+ if (cfs_rq_throttled(cfs_rq))
+ return;
- list_add_leaf_cfs_rq(cfs_rq_of(se));
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
/* Start to propagate at parent */
se = se->parent;
@@ -11297,14 +11297,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se)
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
- if (!cfs_rq_throttled(cfs_rq)){
- update_load_avg(cfs_rq, se, UPDATE_TG);
- list_add_leaf_cfs_rq(cfs_rq);
- continue;
- }
+ update_load_avg(cfs_rq, se, UPDATE_TG);
- if (list_add_leaf_cfs_rq(cfs_rq))
+ if (cfs_rq_throttled(cfs_rq))
break;
+
+ if (!throttled_hierarchy(cfs_rq))
+ list_add_leaf_cfs_rq(cfs_rq);
}
}
#else
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 47b89a0fc6e5..7b19a72408b1 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2044,7 +2044,6 @@ static inline int task_on_rq_migrating(struct task_struct *p)
#define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */
#define WF_MIGRATED 0x20 /* Internal use, task got migrated */
-#define WF_ON_CPU 0x40 /* Wakee is on_cpu */
#ifdef CONFIG_SMP
static_assert(WF_EXEC == SD_BALANCE_EXEC);
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index 05b6c2ad90b9..8739c2a5a54e 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -2316,23 +2316,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/*
* For a single LLC per node, allow an
- * imbalance up to 25% of the node. This is an
- * arbitrary cutoff based on SMT-2 to balance
- * between memory bandwidth and avoiding
- * premature sharing of HT resources and SMT-4
- * or SMT-8 *may* benefit from a different
- * cutoff.
+ * imbalance up to 12.5% of the node. This is an
+ * arbitrary cutoff based on two factors -- SMT and
+ * memory channels. For SMT-2, the intent is to
+ * avoid premature sharing of HT resources but
+ * SMT-4 or SMT-8 *may* benefit from a different
+ * cutoff. For memory channels, this is a very
+ * rough estimate of how many channels may be
+ * active and is based on recent CPUs with
+ * many cores.
*
* For multiple LLCs, allow an imbalance
* until multiple tasks would share an LLC
* on one node while LLCs on another node
- * remain idle.
+ * remain idle. This assumes that there are
+ * enough logical CPUs per LLC to avoid SMT
+ * factors and that there is a correlation
+ * between LLCs and memory channels.
*/
nr_llcs = sd->span_weight / child->span_weight;
if (nr_llcs == 1)
- imb = sd->span_weight >> 2;
+ imb = sd->span_weight >> 3;
else
imb = nr_llcs;
+ imb = max(1U, imb);
sd->imb_numa_nr = imb;
/* Set span based on the first NUMA domain. */