diff options
author | Ingo Molnar <mingo@kernel.org> | 2022-06-25 09:40:52 +0200 |
---|---|---|
committer | Ingo Molnar <mingo@kernel.org> | 2022-06-25 09:40:52 +0200 |
commit | f1d86ac09d85db494961b6388d64c4b35ea02e5c (patch) | |
tree | 386c8be27046f0b3f2c4da9d409a152258ca3bdd /kernel/sched | |
parent | 7777248f857a29211f541f0bf5c8fe4e4e6d8769 (diff) | |
parent | f3dd3f674555bd9455c5ae7fafce0696bd9931b3 (diff) | |
download | linux-next-f1d86ac09d85db494961b6388d64c4b35ea02e5c.tar.gz |
Merge branch into tip/master: 'sched/core'
* sched/core:
f3dd3f674555 sched: Remove the limitation of WF_ON_CPU on wakelist if wakee cpu is idle
28156108fecb sched: Fix the check of nr_running at queue wakelist
792b9f65a568 sched: Allow newidle balancing to bail out of load_balance
2ed81e765417 sched/deadline: Use proc_douintvec_minmax() limit minimum value
51bf903b64bd sched/fair: Optimize and simplify rq leaf_cfs_rq_list
f5b2eeb49991 sched/fair: Consider CPU affinity when allowing NUMA imbalance in find_idlest_group()
026b98a93bbd sched/numa: Adjust imb_numa_nr to a better approximation of memory channels
cb29a5c19d2d sched/numa: Apply imbalance limitations consistently
13ede3315087 sched/numa: Do not swap tasks between nodes when spare capacity is available
70ce3ea9aa4e sched/numa: Initialise numa_migrate_retry
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Diffstat (limited to 'kernel/sched')
-rw-r--r-- | kernel/sched/core.c | 30 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 205 | ||||
-rw-r--r-- | kernel/sched/sched.h | 1 | ||||
-rw-r--r-- | kernel/sched/topology.c | 23 |
5 files changed, 139 insertions, 126 deletions
diff --git a/kernel/sched/core.c b/kernel/sched/core.c index da0bf6fe9ecd..daadedc78fd9 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3808,7 +3808,7 @@ bool cpus_share_cache(int this_cpu, int that_cpu) return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu); } -static inline bool ttwu_queue_cond(int cpu, int wake_flags) +static inline bool ttwu_queue_cond(int cpu) { /* * Do not complicate things with the async wake_list while the CPU is @@ -3824,13 +3824,21 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) if (!cpus_share_cache(smp_processor_id(), cpu)) return true; + if (cpu == smp_processor_id()) + return false; + /* - * If the task is descheduling and the only running task on the - * CPU then use the wakelist to offload the task activation to - * the soon-to-be-idle CPU as the current CPU is likely busy. - * nr_running is checked to avoid unnecessary task stacking. + * If the wakee cpu is idle, or the task is descheduling and the + * only running task on the CPU, then use the wakelist to offload + * the task activation to the idle (or soon-to-be-idle) CPU as + * the current CPU is likely busy. nr_running is checked to + * avoid unnecessary task stacking. + * + * Note that we can only get here with (wakee) p->on_rq=0, + * p->on_cpu can be whatever, we've done the dequeue, so + * the wakee has been accounted out of ->nr_running. */ - if ((wake_flags & WF_ON_CPU) && cpu_rq(cpu)->nr_running <= 1) + if (!cpu_rq(cpu)->nr_running) return true; return false; @@ -3838,10 +3846,7 @@ static inline bool ttwu_queue_cond(int cpu, int wake_flags) static bool ttwu_queue_wakelist(struct task_struct *p, int cpu, int wake_flags) { - if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu, wake_flags)) { - if (WARN_ON_ONCE(cpu == smp_processor_id())) - return false; - + if (sched_feat(TTWU_QUEUE) && ttwu_queue_cond(cpu)) { sched_clock_cpu(cpu); /* Sync clocks across CPUs */ __ttwu_queue_wakelist(p, cpu, wake_flags); return true; @@ -4163,7 +4168,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) * scheduling. */ if (smp_load_acquire(&p->on_cpu) && - ttwu_queue_wakelist(p, task_cpu(p), wake_flags | WF_ON_CPU)) + ttwu_queue_wakelist(p, task_cpu(p), wake_flags)) goto unlock; /* @@ -4753,7 +4758,8 @@ static inline void prepare_task(struct task_struct *next) * Claim the task as running, we do this before switching to it * such that any running task will have this set. * - * See the ttwu() WF_ON_CPU case and its ordering comment. + * See the smp_load_acquire(&p->on_cpu) case in ttwu() and + * its ordering comment. */ WRITE_ONCE(next->on_cpu, 1); #endif diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index b5152961b743..5867e186c39a 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -30,14 +30,16 @@ static struct ctl_table sched_dl_sysctls[] = { .data = &sysctl_sched_dl_period_max, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra1 = (void *)&sysctl_sched_dl_period_min, }, { .procname = "sched_deadline_period_min_us", .data = &sysctl_sched_dl_period_min, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec_minmax, + .extra2 = (void *)&sysctl_sched_dl_period_max, }, {} }; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 77b2048a9326..8bed75757e65 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -1055,6 +1055,33 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#ifdef CONFIG_NUMA +#define NUMA_IMBALANCE_MIN 2 + +static inline long +adjust_numa_imbalance(int imbalance, int dst_running, int imb_numa_nr) +{ + /* + * Allow a NUMA imbalance if busy CPUs is less than the maximum + * threshold. Above this threshold, individual tasks may be contending + * for both memory bandwidth and any shared HT resources. This is an + * approximation as the number of running tasks may not be related to + * the number of busy CPUs due to sched_setaffinity. + */ + if (dst_running > imb_numa_nr) + return imbalance; + + /* + * Allow a small imbalance based on a simple pair of communicating + * tasks that remain local when the destination is lightly loaded. + */ + if (imbalance <= NUMA_IMBALANCE_MIN) + return 0; + + return imbalance; +} +#endif /* CONFIG_NUMA */ + #ifdef CONFIG_NUMA_BALANCING /* * Approximate time to scan a full NUMA task in ms. The task scan period is @@ -1548,8 +1575,6 @@ struct task_numa_env { static unsigned long cpu_load(struct rq *rq); static unsigned long cpu_runnable(struct rq *rq); -static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int imb_numa_nr); static inline enum numa_type numa_classify(unsigned int imbalance_pct, @@ -1790,6 +1815,15 @@ static bool task_numa_compare(struct task_numa_env *env, */ cur_ng = rcu_dereference(cur->numa_group); if (cur_ng == p_ng) { + /* + * Do not swap within a group or between tasks that have + * no group if there is spare capacity. Swapping does + * not address the load imbalance and helps one task at + * the cost of punishing another. + */ + if (env->dst_stats.node_type == node_has_spare) + goto unlock; + imp = taskimp + task_weight(cur, env->src_nid, dist) - task_weight(cur, env->dst_nid, dist); /* @@ -2885,6 +2919,7 @@ void init_numa_balancing(unsigned long clone_flags, struct task_struct *p) p->node_stamp = 0; p->numa_scan_seq = mm ? mm->numa_scan_seq : 0; p->numa_scan_period = sysctl_numa_balancing_scan_delay; + p->numa_migrate_retry = 0; /* Protect against double add, see task_tick_numa and task_numa_work */ p->numa_work.next = &p->numa_work; p->numa_faults = NULL; @@ -3144,6 +3179,8 @@ void reweight_task(struct task_struct *p, int prio) load->inv_weight = sched_prio_to_wmult[prio]; } +static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); + #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_SMP /* @@ -3254,8 +3291,6 @@ static long calc_group_shares(struct cfs_rq *cfs_rq) } #endif /* CONFIG_SMP */ -static inline int throttled_hierarchy(struct cfs_rq *cfs_rq); - /* * Recomputes the group entity based on the current state of its group * runqueue. @@ -4368,16 +4403,11 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) __enqueue_entity(cfs_rq, se); se->on_rq = 1; - /* - * When bandwidth control is enabled, cfs might have been removed - * because of a parent been throttled but cfs->nr_running > 1. Try to - * add it unconditionally. - */ - if (cfs_rq->nr_running == 1 || cfs_bandwidth_used()) - list_add_leaf_cfs_rq(cfs_rq); - - if (cfs_rq->nr_running == 1) + if (cfs_rq->nr_running == 1) { check_enqueue_throttle(cfs_rq); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); + } } static void __clear_buddies_last(struct sched_entity *se) @@ -4992,11 +5022,18 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* update hierarchical throttle state */ walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq); - /* Nothing to run but something to decay (on_list)? Complete the branch */ if (!cfs_rq->load.weight) { - if (cfs_rq->on_list) - goto unthrottle_throttle; - return; + if (!cfs_rq->on_list) + return; + /* + * Nothing to run but something to decay (on_list)? + * Complete the branch. + */ + for_each_sched_entity(se) { + if (list_add_leaf_cfs_rq(cfs_rq_of(se))) + break; + } + goto unthrottle_throttle; } task_delta = cfs_rq->h_nr_running; @@ -5034,31 +5071,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(qcfs_rq)) goto unthrottle_throttle; - - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(qcfs_rq)) - list_add_leaf_cfs_rq(qcfs_rq); } /* At this point se is NULL and we are at root level*/ add_nr_running(rq, task_delta); unthrottle_throttle: - /* - * The cfs_rq_throttled() breaks in the above iteration can result in - * incomplete leaf list maintenance, resulting in triggering the - * assertion below. - */ - for_each_sched_entity(se) { - struct cfs_rq *qcfs_rq = cfs_rq_of(se); - - if (list_add_leaf_cfs_rq(qcfs_rq)) - break; - } - assert_list_leaf_cfs_rq(rq); /* Determine whether we need to wake up potentially idle CPU: */ @@ -5713,13 +5731,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) /* end evaluation on encountering a throttled cfs_rq */ if (cfs_rq_throttled(cfs_rq)) goto enqueue_throttle; - - /* - * One parent has been throttled and cfs_rq removed from the - * list. Add it back to not break the leaf list. - */ - if (throttled_hierarchy(cfs_rq)) - list_add_leaf_cfs_rq(cfs_rq); } /* At this point se is NULL and we are at root level*/ @@ -5743,21 +5754,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) update_overutilized_status(rq); enqueue_throttle: - if (cfs_bandwidth_used()) { - /* - * When bandwidth control is enabled; the cfs_rq_throttled() - * breaks in the above iteration can result in incomplete - * leaf list maintenance, resulting in triggering the assertion - * below. - */ - for_each_sched_entity(se) { - cfs_rq = cfs_rq_of(se); - - if (list_add_leaf_cfs_rq(cfs_rq)) - break; - } - } - assert_list_leaf_cfs_rq(rq); hrtick_update(rq); @@ -9058,16 +9054,6 @@ static bool update_pick_idlest(struct sched_group *idlest, } /* - * Allow a NUMA imbalance if busy CPUs is less than 25% of the domain. - * This is an approximation as the number of running tasks may not be - * related to the number of busy CPUs due to sched_setaffinity. - */ -static inline bool allow_numa_imbalance(int running, int imb_numa_nr) -{ - return running <= imb_numa_nr; -} - -/* * find_idlest_group() finds and returns the least busy CPU group within the * domain. * @@ -9183,7 +9169,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) break; case group_has_spare: +#ifdef CONFIG_NUMA if (sd->flags & SD_NUMA) { + int imb_numa_nr = sd->imb_numa_nr; #ifdef CONFIG_NUMA_BALANCING int idlest_cpu; /* @@ -9196,17 +9184,31 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) idlest_cpu = cpumask_first(sched_group_span(idlest)); if (cpu_to_node(idlest_cpu) == p->numa_preferred_nid) return idlest; -#endif +#endif /* CONFIG_NUMA_BALANCING */ /* * Otherwise, keep the task close to the wakeup source * and improve locality if the number of running tasks * would remain below threshold where an imbalance is - * allowed. If there is a real need of migration, - * periodic load balance will take care of it. + * allowed while accounting for the possibility the + * task is pinned to a subset of CPUs. If there is a + * real need of migration, periodic load balance will + * take care of it. */ - if (allow_numa_imbalance(local_sgs.sum_nr_running + 1, sd->imb_numa_nr)) + if (p->nr_cpus_allowed != NR_CPUS) { + struct cpumask *cpus = this_cpu_cpumask_var_ptr(select_idle_mask); + + cpumask_and(cpus, sched_group_span(local), p->cpus_ptr); + imb_numa_nr = min(cpumask_weight(cpus), sd->imb_numa_nr); + } + + imbalance = abs(local_sgs.idle_cpus - idlest_sgs.idle_cpus); + if (!adjust_numa_imbalance(imbalance, + local_sgs.sum_nr_running + 1, + imb_numa_nr)) { return NULL; + } } +#endif /* CONFIG_NUMA */ /* * Select group with highest number of idle CPUs. We could also @@ -9293,24 +9295,6 @@ next_group: } } -#define NUMA_IMBALANCE_MIN 2 - -static inline long adjust_numa_imbalance(int imbalance, - int dst_running, int imb_numa_nr) -{ - if (!allow_numa_imbalance(dst_running, imb_numa_nr)) - return imbalance; - - /* - * Allow a small imbalance based on a simple pair of communicating - * tasks that remain local when the destination is lightly loaded. - */ - if (imbalance <= NUMA_IMBALANCE_MIN) - return 0; - - return imbalance; -} - /** * calculate_imbalance - Calculate the amount of imbalance present within the * groups of a given sched_domain during load balance. @@ -9395,7 +9379,7 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s */ env->migration_type = migrate_task; lsub_positive(&nr_diff, local->sum_nr_running); - env->imbalance = nr_diff >> 1; + env->imbalance = nr_diff; } else { /* @@ -9403,15 +9387,21 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s * idle cpus. */ env->migration_type = migrate_task; - env->imbalance = max_t(long, 0, (local->idle_cpus - - busiest->idle_cpus) >> 1); + env->imbalance = max_t(long, 0, + (local->idle_cpus - busiest->idle_cpus)); } +#ifdef CONFIG_NUMA /* Consider allowing a small imbalance between NUMA groups */ if (env->sd->flags & SD_NUMA) { env->imbalance = adjust_numa_imbalance(env->imbalance, - local->sum_nr_running + 1, env->sd->imb_numa_nr); + local->sum_nr_running + 1, + env->sd->imb_numa_nr); } +#endif + + /* Number of tasks to move to restore balance */ + env->imbalance >>= 1; return; } @@ -9834,9 +9824,15 @@ static int should_we_balance(struct lb_env *env) /* * In the newly idle case, we will allow all the CPUs * to do the newly idle load balance. + * + * However, we bail out if we already have tasks or a wakeup pending, + * to optimize wakeup latency. */ - if (env->idle == CPU_NEWLY_IDLE) + if (env->idle == CPU_NEWLY_IDLE) { + if (env->dst_rq->nr_running > 0 || env->dst_rq->ttwu_pending) + return 0; return 1; + } /* Try to find first idle CPU */ for_each_cpu_and(cpu, group_balance_mask(sg), env->cpus) { @@ -11287,9 +11283,13 @@ static inline bool vruntime_normalized(struct task_struct *p) */ static void propagate_entity_cfs_rq(struct sched_entity *se) { - struct cfs_rq *cfs_rq; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + if (cfs_rq_throttled(cfs_rq)) + return; - list_add_leaf_cfs_rq(cfs_rq_of(se)); + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); /* Start to propagate at parent */ se = se->parent; @@ -11297,14 +11297,13 @@ static void propagate_entity_cfs_rq(struct sched_entity *se) for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - if (!cfs_rq_throttled(cfs_rq)){ - update_load_avg(cfs_rq, se, UPDATE_TG); - list_add_leaf_cfs_rq(cfs_rq); - continue; - } + update_load_avg(cfs_rq, se, UPDATE_TG); - if (list_add_leaf_cfs_rq(cfs_rq)) + if (cfs_rq_throttled(cfs_rq)) break; + + if (!throttled_hierarchy(cfs_rq)) + list_add_leaf_cfs_rq(cfs_rq); } } #else diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 47b89a0fc6e5..7b19a72408b1 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2044,7 +2044,6 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_SYNC 0x10 /* Waker goes to sleep after wakeup */ #define WF_MIGRATED 0x20 /* Internal use, task got migrated */ -#define WF_ON_CPU 0x40 /* Wakee is on_cpu */ #ifdef CONFIG_SMP static_assert(WF_EXEC == SD_BALANCE_EXEC); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 05b6c2ad90b9..8739c2a5a54e 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -2316,23 +2316,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att /* * For a single LLC per node, allow an - * imbalance up to 25% of the node. This is an - * arbitrary cutoff based on SMT-2 to balance - * between memory bandwidth and avoiding - * premature sharing of HT resources and SMT-4 - * or SMT-8 *may* benefit from a different - * cutoff. + * imbalance up to 12.5% of the node. This is + * arbitrary cutoff based two factors -- SMT and + * memory channels. For SMT-2, the intent is to + * avoid premature sharing of HT resources but + * SMT-4 or SMT-8 *may* benefit from a different + * cutoff. For memory channels, this is a very + * rough estimate of how many channels may be + * active and is based on recent CPUs with + * many cores. * * For multiple LLCs, allow an imbalance * until multiple tasks would share an LLC * on one node while LLCs on another node - * remain idle. + * remain idle. This assumes that there are + * enough logical CPUs per LLC to avoid SMT + * factors and that there is a correlation + * between LLCs and memory channels. */ nr_llcs = sd->span_weight / child->span_weight; if (nr_llcs == 1) - imb = sd->span_weight >> 2; + imb = sd->span_weight >> 3; else imb = nr_llcs; + imb = max(1U, imb); sd->imb_numa_nr = imb; /* Set span based on the first NUMA domain. */ |