summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author: Stephen Rothwell <sfr@canb.auug.org.au> 2023-05-18 11:27:02 +1000
committer: Stephen Rothwell <sfr@canb.auug.org.au> 2023-05-18 11:27:02 +1000
commit: 75cac78f189a9ac1eefb4d4ffbcbdfe94a50329a (patch)
tree: 135c7ea11c62ead1989db1ca2454007844bdc0ed
parent: 82572f0f84a0b6bf96774a1424ea7a12be27b27e (diff)
parent: 2ef269ef1ac006acf974793d975539244d77b28f (diff)
download: linux-next-75cac78f189a9ac1eefb4d4ffbcbdfe94a50329a.tar.gz
Merge branch 'for-next' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup.git
-rw-r--r-- include/linux/cpuset.h 12
-rw-r--r-- include/linux/sched.h 4
-rw-r--r-- kernel/cgroup/cgroup.c 4
-rw-r--r-- kernel/cgroup/cpuset.c 244
-rw-r--r-- kernel/sched/core.c 41
-rw-r--r-- kernel/sched/deadline.c 67
-rw-r--r-- kernel/sched/sched.h 2
7 files changed, 246 insertions, 128 deletions
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 980b76a1237e..d629094fac6e 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -71,8 +71,10 @@ extern void cpuset_init_smp(void);
extern void cpuset_force_rebuild(void);
extern void cpuset_update_active_cpus(void);
extern void cpuset_wait_for_hotplug(void);
-extern void cpuset_read_lock(void);
-extern void cpuset_read_unlock(void);
+extern void inc_dl_tasks_cs(struct task_struct *task);
+extern void dec_dl_tasks_cs(struct task_struct *task);
+extern void cpuset_lock(void);
+extern void cpuset_unlock(void);
extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask);
extern bool cpuset_cpus_allowed_fallback(struct task_struct *p);
extern nodemask_t cpuset_mems_allowed(struct task_struct *p);
@@ -189,8 +191,10 @@ static inline void cpuset_update_active_cpus(void)
static inline void cpuset_wait_for_hotplug(void) { }
-static inline void cpuset_read_lock(void) { }
-static inline void cpuset_read_unlock(void) { }
+static inline void inc_dl_tasks_cs(struct task_struct *task) { }
+static inline void dec_dl_tasks_cs(struct task_struct *task) { }
+static inline void cpuset_lock(void) { }
+static inline void cpuset_unlock(void) { }
static inline void cpuset_cpus_allowed(struct task_struct *p,
struct cpumask *mask)
diff --git a/include/linux/sched.h b/include/linux/sched.h
index eed5d65b8d1f..2553918f0b61 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1852,7 +1852,9 @@ current_restore_flags(unsigned long orig_flags, unsigned long flags)
}
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int task_can_attach(struct task_struct *p, const struct cpumask *cs_effective_cpus);
+extern int task_can_attach(struct task_struct *p);
+extern int dl_bw_alloc(int cpu, u64 dl_bw);
+extern void dl_bw_free(int cpu, u64 dl_bw);
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask);
diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
index b26ae200abef..306385a7234b 100644
--- a/kernel/cgroup/cgroup.c
+++ b/kernel/cgroup/cgroup.c
@@ -57,6 +57,7 @@
#include <linux/file.h>
#include <linux/fs_parser.h>
#include <linux/sched/cputime.h>
+#include <linux/sched/deadline.h>
#include <linux/psi.h>
#include <net/sock.h>
@@ -6695,6 +6696,9 @@ void cgroup_exit(struct task_struct *tsk)
list_add_tail(&tsk->cg_list, &cset->dying_tasks);
cset->nr_tasks--;
+ if (dl_task(tsk))
+ dec_dl_tasks_cs(tsk);
+
WARN_ON_ONCE(cgroup_task_frozen(tsk));
if (unlikely(!(tsk->flags & PF_KTHREAD) &&
test_bit(CGRP_FREEZE, &task_dfl_cgroup(tsk)->flags)))
diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
index e4ca2dd2b764..2c76fcd9f0bc 100644
--- a/kernel/cgroup/cpuset.c
+++ b/kernel/cgroup/cpuset.c
@@ -193,6 +193,14 @@ struct cpuset {
int use_parent_ecpus;
int child_ecpus_count;
+ /*
+ * number of SCHED_DEADLINE tasks attached to this cpuset, so that we
+ * know when to rebuild associated root domain bandwidth information.
+ */
+ int nr_deadline_tasks;
+ int nr_migrate_dl_tasks;
+ u64 sum_migrate_dl_bw;
+
/* Invalid partition error code, not lock protected */
enum prs_errcode prs_err;
@@ -245,6 +253,20 @@ static inline struct cpuset *parent_cs(struct cpuset *cs)
return css_cs(cs->css.parent);
}
+/*
+ * inc_dl_tasks_cs - count one more SCHED_DEADLINE task in @p's cpuset
+ * @p: task whose cpuset (as returned by task_cs()) is charged
+ *
+ * NOTE(review): the counter is a plain int updated with no locking in this
+ * function; callers presumably serialize the update (e.g. via
+ * cpuset_lock()) - confirm for every call site.
+ */
+void inc_dl_tasks_cs(struct task_struct *p)
+{
+	task_cs(p)->nr_deadline_tasks++;
+}
+
+/*
+ * dec_dl_tasks_cs - count one SCHED_DEADLINE task less in @p's cpuset
+ * @p: task whose cpuset (as returned by task_cs()) is uncharged
+ *
+ * NOTE(review): the counter is a plain int updated with no locking and no
+ * underflow check in this function; callers presumably serialize the
+ * update and pair it with an earlier increment - confirm for every call
+ * site.
+ */
+void dec_dl_tasks_cs(struct task_struct *p)
+{
+	task_cs(p)->nr_deadline_tasks--;
+}
+
/* bits in struct cpuset flags field */
typedef enum {
CS_ONLINE,
@@ -366,22 +388,23 @@ static struct cpuset top_cpuset = {
if (is_cpuset_online(((des_cs) = css_cs((pos_css)))))
/*
- * There are two global locks guarding cpuset structures - cpuset_rwsem and
+ * There are two global locks guarding cpuset structures - cpuset_mutex and
* callback_lock. We also require taking task_lock() when dereferencing a
* task's cpuset pointer. See "The task_lock() exception", at the end of this
- * comment. The cpuset code uses only cpuset_rwsem write lock. Other
- * kernel subsystems can use cpuset_read_lock()/cpuset_read_unlock() to
- * prevent change to cpuset structures.
+ * comment. The cpuset code uses only cpuset_mutex. Other kernel subsystems
+ * can use cpuset_lock()/cpuset_unlock() to prevent change to cpuset
+ * structures. Note that cpuset_mutex needs to be a mutex as it is used in
+ * paths that rely on priority inheritance (e.g. scheduler - on RT) for
+ * correctness.
*
* A task must hold both locks to modify cpusets. If a task holds
- * cpuset_rwsem, it blocks others wanting that rwsem, ensuring that it
- * is the only task able to also acquire callback_lock and be able to
- * modify cpusets. It can perform various checks on the cpuset structure
- * first, knowing nothing will change. It can also allocate memory while
- * just holding cpuset_rwsem. While it is performing these checks, various
- * callback routines can briefly acquire callback_lock to query cpusets.
- * Once it is ready to make the changes, it takes callback_lock, blocking
- * everyone else.
+ * cpuset_mutex, it blocks others, ensuring that it is the only task able to
+ * also acquire callback_lock and be able to modify cpusets. It can perform
+ * various checks on the cpuset structure first, knowing nothing will change.
+ * It can also allocate memory while just holding cpuset_mutex. While it is
+ * performing these checks, various callback routines can briefly acquire
+ * callback_lock to query cpusets. Once it is ready to make the changes, it
+ * takes callback_lock, blocking everyone else.
*
* Calls to the kernel memory allocator can not be made while holding
* callback_lock, as that would risk double tripping on callback_lock
@@ -403,16 +426,16 @@ static struct cpuset top_cpuset = {
* guidelines for accessing subsystem state in kernel/cgroup.c
*/
-DEFINE_STATIC_PERCPU_RWSEM(cpuset_rwsem);
+static DEFINE_MUTEX(cpuset_mutex);
-void cpuset_read_lock(void)
+void cpuset_lock(void)
{
- percpu_down_read(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
}
-void cpuset_read_unlock(void)
+void cpuset_unlock(void)
{
- percpu_up_read(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
static DEFINE_SPINLOCK(callback_lock);
@@ -496,7 +519,7 @@ static inline bool partition_is_populated(struct cpuset *cs,
* One way or another, we guarantee to return some non-empty subset
* of cpu_online_mask.
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_cpus(struct task_struct *tsk,
struct cpumask *pmask)
@@ -538,7 +561,7 @@ out_unlock:
* One way or another, we guarantee to return some non-empty subset
* of node_states[N_MEMORY].
*
- * Call with callback_lock or cpuset_rwsem held.
+ * Call with callback_lock or cpuset_mutex held.
*/
static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
{
@@ -550,7 +573,7 @@ static void guarantee_online_mems(struct cpuset *cs, nodemask_t *pmask)
/*
* update task's spread flag if cpuset's page/slab spread flag is set
*
- * Call with callback_lock or cpuset_rwsem held. The check can be skipped
+ * Call with callback_lock or cpuset_mutex held. The check can be skipped
* if on default hierarchy.
*/
static void cpuset_update_task_spread_flags(struct cpuset *cs,
@@ -575,7 +598,7 @@ static void cpuset_update_task_spread_flags(struct cpuset *cs,
*
* One cpuset is a subset of another if all its allowed CPUs and
* Memory Nodes are a subset of the other, and its exclusive flags
- * are only set if the other's are set. Call holding cpuset_rwsem.
+ * are only set if the other's are set. Call holding cpuset_mutex.
*/
static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
@@ -713,7 +736,7 @@ out:
* If we replaced the flag and mask values of the current cpuset
* (cur) with those values in the trial cpuset (trial), would
* our various subset and exclusive rules still be valid? Presumes
- * cpuset_rwsem held.
+ * cpuset_mutex held.
*
* 'cur' is the address of an actual, in-use cpuset. Operations
* such as list traversal that depend on the actual address of the
@@ -829,7 +852,7 @@ static void update_domain_attr_tree(struct sched_domain_attr *dattr,
rcu_read_unlock();
}
-/* Must be called with cpuset_rwsem held. */
+/* Must be called with cpuset_mutex held. */
static inline int nr_cpusets(void)
{
/* jump label reference count + the top-level cpuset */
@@ -855,7 +878,7 @@ static inline int nr_cpusets(void)
* domains when operating in the severe memory shortage situations
* that could cause allocation failures below.
*
- * Must be called with cpuset_rwsem held.
+ * Must be called with cpuset_mutex held.
*
* The three key local variables below are:
* cp - cpuset pointer, used (together with pos_css) to perform a
@@ -1066,11 +1089,14 @@ done:
return ndoms;
}
-static void update_tasks_root_domain(struct cpuset *cs)
+static void dl_update_tasks_root_domain(struct cpuset *cs)
{
struct css_task_iter it;
struct task_struct *task;
+ if (cs->nr_deadline_tasks == 0)
+ return;
+
css_task_iter_start(&cs->css, 0, &it);
while ((task = css_task_iter_next(&it)))
@@ -1079,12 +1105,12 @@ static void update_tasks_root_domain(struct cpuset *cs)
css_task_iter_end(&it);
}
-static void rebuild_root_domains(void)
+static void dl_rebuild_rd_accounting(void)
{
struct cpuset *cs = NULL;
struct cgroup_subsys_state *pos_css;
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
lockdep_assert_cpus_held();
lockdep_assert_held(&sched_domains_mutex);
@@ -1107,7 +1133,7 @@ static void rebuild_root_domains(void)
rcu_read_unlock();
- update_tasks_root_domain(cs);
+ dl_update_tasks_root_domain(cs);
rcu_read_lock();
css_put(&cs->css);
@@ -1121,7 +1147,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
{
mutex_lock(&sched_domains_mutex);
partition_sched_domains_locked(ndoms_new, doms_new, dattr_new);
- rebuild_root_domains();
+ dl_rebuild_rd_accounting();
mutex_unlock(&sched_domains_mutex);
}
@@ -1134,7 +1160,7 @@ partition_and_rebuild_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
* 'cpus' is removed, then call this routine to rebuild the
* scheduler's dynamic sched domains.
*
- * Call with cpuset_rwsem held. Takes cpus_read_lock().
+ * Call with cpuset_mutex held. Takes cpus_read_lock().
*/
static void rebuild_sched_domains_locked(void)
{
@@ -1145,7 +1171,7 @@ static void rebuild_sched_domains_locked(void)
int ndoms;
lockdep_assert_cpus_held();
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
* If we have raced with CPU hotplug, return early to avoid
@@ -1196,9 +1222,9 @@ static void rebuild_sched_domains_locked(void)
void rebuild_sched_domains(void)
{
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
rebuild_sched_domains_locked();
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
}
@@ -1208,7 +1234,7 @@ void rebuild_sched_domains(void)
* @new_cpus: the temp variable for the new effective_cpus mask
*
* Iterate through each task of @cs updating its cpus_allowed to the
- * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable. For top_cpuset, task_cpu_possible_mask()
* is used instead of effective_cpus to make sure all offline CPUs are also
* included as hotplug code won't update cpumasks for tasks in top_cpuset.
@@ -1322,7 +1348,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
int old_prs, new_prs;
int part_error = PERR_NONE; /* Partition error? */
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
* The parent must be a partition root.
@@ -1545,7 +1571,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cs, int cmd,
*
* On legacy hierarchy, effective_cpus will be the same with cpu_allowed.
*
- * Called with cpuset_rwsem held
+ * Called with cpuset_mutex held
*/
static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp,
bool force)
@@ -1705,7 +1731,7 @@ static void update_sibling_cpumasks(struct cpuset *parent, struct cpuset *cs,
struct cpuset *sibling;
struct cgroup_subsys_state *pos_css;
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
/*
* Check all its siblings and call update_cpumasks_hier()
@@ -1955,12 +1981,12 @@ static void *cpuset_being_rebound;
* @cs: the cpuset in which each task's mems_allowed mask needs to be changed
*
* Iterate through each task of @cs updating its mems_allowed to the
- * effective cpuset's. As this function is called with cpuset_rwsem held,
+ * effective cpuset's. As this function is called with cpuset_mutex held,
* cpuset membership stays stable.
*/
static void update_tasks_nodemask(struct cpuset *cs)
{
- static nodemask_t newmems; /* protected by cpuset_rwsem */
+ static nodemask_t newmems; /* protected by cpuset_mutex */
struct css_task_iter it;
struct task_struct *task;
@@ -1973,7 +1999,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
* take while holding tasklist_lock. Forks can happen - the
* mpol_dup() cpuset_being_rebound check will catch such forks,
* and rebind their vma mempolicies too. Because we still hold
- * the global cpuset_rwsem, we know that no other rebind effort
+ * the global cpuset_mutex, we know that no other rebind effort
* will be contending for the global variable cpuset_being_rebound.
* It's ok if we rebind the same mm twice; mpol_rebind_mm()
* is idempotent. Also migrate pages in each mm to new nodes.
@@ -2019,7 +2045,7 @@ static void update_tasks_nodemask(struct cpuset *cs)
*
* On legacy hierarchy, effective_mems will be the same with mems_allowed.
*
- * Called with cpuset_rwsem held
+ * Called with cpuset_mutex held
*/
static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
{
@@ -2072,7 +2098,7 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems)
* mempolicies and if the cpuset is marked 'memory_migrate',
* migrate the tasks pages to the new memory.
*
- * Call with cpuset_rwsem held. May take callback_lock during call.
+ * Call with cpuset_mutex held. May take callback_lock during call.
* Will take tasklist_lock, scan tasklist for tasks in cpuset cs,
* lock each such tasks mm->mmap_lock, scan its vma's and rebind
* their mempolicies to the cpusets new mems_allowed.
@@ -2164,7 +2190,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
* @cs: the cpuset in which each task's spread flags needs to be changed
*
* Iterate through each task of @cs updating its spread flags. As this
- * function is called with cpuset_rwsem held, cpuset membership stays
+ * function is called with cpuset_mutex held, cpuset membership stays
* stable.
*/
static void update_tasks_flags(struct cpuset *cs)
@@ -2184,7 +2210,7 @@ static void update_tasks_flags(struct cpuset *cs)
* cs: the cpuset to update
* turning_on: whether the flag is being set or cleared
*
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
*/
static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs,
@@ -2234,7 +2260,7 @@ out:
* @new_prs: new partition root state
* Return: 0 if successful, != 0 if error
*
- * Call with cpuset_rwsem held.
+ * Call with cpuset_mutex held.
*/
static int update_prstate(struct cpuset *cs, int new_prs)
{
@@ -2472,19 +2498,26 @@ static int cpuset_can_attach_check(struct cpuset *cs)
return 0;
}
-/* Called by cgroups to determine if a cpuset is usable; cpuset_rwsem held */
+/*
+ * reset_migrate_dl_data - forget the pending SCHED_DEADLINE migration totals
+ * @cs: cpuset whose migration bookkeeping is cleared
+ *
+ * Zeroes both the summed bandwidth and the task count accumulated for
+ * deadline tasks that are being attached to @cs.
+ */
+static void reset_migrate_dl_data(struct cpuset *cs)
+{
+	cs->sum_migrate_dl_bw = 0;
+	cs->nr_migrate_dl_tasks = 0;
+}
+
+/* Called by cgroups to determine if a cpuset is usable; cpuset_mutex held */
static int cpuset_can_attach(struct cgroup_taskset *tset)
{
struct cgroup_subsys_state *css;
- struct cpuset *cs;
+ struct cpuset *cs, *oldcs;
struct task_struct *task;
int ret;
/* used later by cpuset_attach() */
cpuset_attach_old_cs = task_cs(cgroup_taskset_first(tset, &css));
+ oldcs = cpuset_attach_old_cs;
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs);
@@ -2492,21 +2525,46 @@ static int cpuset_can_attach(struct cgroup_taskset *tset)
goto out_unlock;
cgroup_taskset_for_each(task, css, tset) {
- ret = task_can_attach(task, cs->effective_cpus);
+ ret = task_can_attach(task);
if (ret)
goto out_unlock;
ret = security_task_setscheduler(task);
if (ret)
goto out_unlock;
+
+ if (dl_task(task)) {
+ cs->nr_migrate_dl_tasks++;
+ cs->sum_migrate_dl_bw += task->dl.dl_bw;
+ }
}
+ if (!cs->nr_migrate_dl_tasks)
+ goto out_success;
+
+ if (!cpumask_intersects(oldcs->effective_cpus, cs->effective_cpus)) {
+ int cpu = cpumask_any_and(cpu_active_mask, cs->effective_cpus);
+
+ if (unlikely(cpu >= nr_cpu_ids)) {
+ reset_migrate_dl_data(cs);
+ ret = -EINVAL;
+ goto out_unlock;
+ }
+
+ ret = dl_bw_alloc(cpu, cs->sum_migrate_dl_bw);
+ if (ret) {
+ reset_migrate_dl_data(cs);
+ goto out_unlock;
+ }
+ }
+
+out_success:
/*
* Mark attach is in progress. This makes validate_change() fail
* changes which zero cpus/mems_allowed.
*/
cs->attach_in_progress++;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -2518,15 +2576,23 @@ static void cpuset_cancel_attach(struct cgroup_taskset *tset)
cgroup_taskset_first(tset, &css);
cs = css_cs(css);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+
+ if (cs->nr_migrate_dl_tasks) {
+ int cpu = cpumask_any(cs->effective_cpus);
+
+ dl_bw_free(cpu, cs->sum_migrate_dl_bw);
+ reset_migrate_dl_data(cs);
+ }
+
+ mutex_unlock(&cpuset_mutex);
}
/*
- * Protected by cpuset_rwsem. cpus_attach is used only by cpuset_attach_task()
+ * Protected by cpuset_mutex. cpus_attach is used only by cpuset_attach_task()
* but we can't allocate it dynamically there. Define it global and
* allocate from cpuset_init().
*/
@@ -2535,7 +2601,7 @@ static nodemask_t cpuset_attach_nodemask_to;
static void cpuset_attach_task(struct cpuset *cs, struct task_struct *task)
{
- percpu_rwsem_assert_held(&cpuset_rwsem);
+ lockdep_assert_held(&cpuset_mutex);
if (cs != &top_cpuset)
guarantee_online_cpus(task, cpus_attach);
@@ -2565,7 +2631,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
cs = css_cs(css);
lockdep_assert_cpus_held(); /* see cgroup_attach_lock() */
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cpus_updated = !cpumask_equal(cs->effective_cpus,
oldcs->effective_cpus);
mems_updated = !nodes_equal(cs->effective_mems, oldcs->effective_mems);
@@ -2622,11 +2688,17 @@ static void cpuset_attach(struct cgroup_taskset *tset)
out:
cs->old_mems_allowed = cpuset_attach_nodemask_to;
+ if (cs->nr_migrate_dl_tasks) {
+ cs->nr_deadline_tasks += cs->nr_migrate_dl_tasks;
+ oldcs->nr_deadline_tasks -= cs->nr_migrate_dl_tasks;
+ reset_migrate_dl_data(cs);
+ }
+
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/* The various types of files and directories in a cpuset file system */
@@ -2658,7 +2730,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
int retval = 0;
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs)) {
retval = -ENODEV;
goto out_unlock;
@@ -2694,7 +2766,7 @@ static int cpuset_write_u64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
return retval;
}
@@ -2707,7 +2779,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
int retval = -ENODEV;
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2720,7 +2792,7 @@ static int cpuset_write_s64(struct cgroup_subsys_state *css, struct cftype *cft,
break;
}
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
return retval;
}
@@ -2753,7 +2825,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
* operation like this one can lead to a deadlock through kernfs
* active_ref protection. Let's break the protection. Losing the
* protection is okay as we check whether @cs is online after
- * grabbing cpuset_rwsem anyway. This only happens on the legacy
+ * grabbing cpuset_mutex anyway. This only happens on the legacy
* hierarchies.
*/
css_get(&cs->css);
@@ -2761,7 +2833,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
flush_work(&cpuset_hotplug_work);
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
@@ -2785,7 +2857,7 @@ static ssize_t cpuset_write_resmask(struct kernfs_open_file *of,
free_cpuset(trialcs);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
kernfs_unbreak_active_protection(of->kn);
css_put(&cs->css);
@@ -2933,13 +3005,13 @@ static ssize_t sched_partition_write(struct kernfs_open_file *of, char *buf,
css_get(&cs->css);
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (!is_cpuset_online(cs))
goto out_unlock;
retval = update_prstate(cs, val);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
css_put(&cs->css);
return retval ?: nbytes;
@@ -3156,7 +3228,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
return 0;
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
set_bit(CS_ONLINE, &cs->flags);
if (is_spread_page(parent))
@@ -3207,7 +3279,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
spin_unlock_irq(&callback_lock);
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
return 0;
}
@@ -3228,7 +3300,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
struct cpuset *cs = css_cs(css);
cpus_read_lock();
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
if (is_partition_valid(cs))
update_prstate(cs, 0);
@@ -3247,7 +3319,7 @@ static void cpuset_css_offline(struct cgroup_subsys_state *css)
cpuset_dec();
clear_bit(CS_ONLINE, &cs->flags);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
cpus_read_unlock();
}
@@ -3260,7 +3332,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
static void cpuset_bind(struct cgroup_subsys_state *root_css)
{
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
spin_lock_irq(&callback_lock);
if (is_in_v2_mode()) {
@@ -3273,7 +3345,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css)
}
spin_unlock_irq(&callback_lock);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/*
@@ -3294,14 +3366,14 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
return 0;
lockdep_assert_held(&cgroup_mutex);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* Check to see if task is allowed in the cpuset */
ret = cpuset_can_attach_check(cs);
if (ret)
goto out_unlock;
- ret = task_can_attach(task, cs->effective_cpus);
+ ret = task_can_attach(task);
if (ret)
goto out_unlock;
@@ -3315,7 +3387,7 @@ static int cpuset_can_fork(struct task_struct *task, struct css_set *cset)
*/
cs->attach_in_progress++;
out_unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
return ret;
}
@@ -3331,11 +3403,11 @@ static void cpuset_cancel_fork(struct task_struct *task, struct css_set *cset)
if (same_cs)
return;
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
cs->attach_in_progress--;
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/*
@@ -3363,7 +3435,7 @@ static void cpuset_fork(struct task_struct *task)
}
/* CLONE_INTO_CGROUP */
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
cpuset_attach_task(cs, task);
@@ -3371,7 +3443,7 @@ static void cpuset_fork(struct task_struct *task)
if (!cs->attach_in_progress)
wake_up(&cpuset_attach_wq);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
struct cgroup_subsys cpuset_cgrp_subsys = {
@@ -3472,7 +3544,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
is_empty = cpumask_empty(cs->cpus_allowed) ||
nodes_empty(cs->mems_allowed);
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
/*
* Move tasks to the nearest ancestor with execution resources,
@@ -3482,7 +3554,7 @@ hotplug_update_tasks_legacy(struct cpuset *cs,
if (is_empty)
remove_tasks_in_empty_cpuset(cs);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
}
static void
@@ -3533,14 +3605,14 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp)
retry:
wait_event(cpuset_attach_wq, cs->attach_in_progress == 0);
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/*
* We have raced with task attaching. We wait until attaching
* is finished, so we won't attach a task to an empty cpuset.
*/
if (cs->attach_in_progress) {
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
goto retry;
}
@@ -3637,7 +3709,7 @@ update_tasks:
cpus_updated, mems_updated);
unlock:
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
}
/**
@@ -3667,7 +3739,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
if (on_dfl && !alloc_cpumasks(NULL, &tmp))
ptmp = &tmp;
- percpu_down_write(&cpuset_rwsem);
+ mutex_lock(&cpuset_mutex);
/* fetch the available cpus/mems and find out which changed how */
cpumask_copy(&new_cpus, cpu_active_mask);
@@ -3724,7 +3796,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work)
update_tasks_nodemask(&top_cpuset);
}
- percpu_up_write(&cpuset_rwsem);
+ mutex_unlock(&cpuset_mutex);
/* if cpus or mems changed, we need to propagate to descendants */
if (cpus_updated || mems_updated) {
@@ -4155,7 +4227,7 @@ void __cpuset_memory_pressure_bump(void)
* - Used for /proc/<pid>/cpuset.
* - No need to task_lock(tsk) on this tsk->cpuset reference, as it
* doesn't really matter if tsk->cpuset changes after we read it,
- * and we take cpuset_rwsem, keeping cpuset_attach() from changing it
+ * and we take cpuset_mutex, keeping cpuset_attach() from changing it
* anyway.
*/
int proc_cpuset_show(struct seq_file *m, struct pid_namespace *ns,
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index a68d1276bab0..90005760003f 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7590,6 +7590,7 @@ static int __sched_setscheduler(struct task_struct *p,
int reset_on_fork;
int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
struct rq *rq;
+ bool cpuset_locked = false;
/* The pi code expects interrupts enabled */
BUG_ON(pi && in_interrupt());
@@ -7639,8 +7640,14 @@ recheck:
return retval;
}
- if (pi)
- cpuset_read_lock();
+ /*
+ * SCHED_DEADLINE bandwidth accounting relies on stable cpusets
+ * information.
+ */
+ if (dl_policy(policy) || dl_policy(p->policy)) {
+ cpuset_locked = true;
+ cpuset_lock();
+ }
/*
* Make sure no PI-waiters arrive (or leave) while we are
@@ -7716,8 +7723,8 @@ change:
if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
policy = oldpolicy = -1;
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
goto recheck;
}
@@ -7784,7 +7791,8 @@ change:
task_rq_unlock(rq, p, &rf);
if (pi) {
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
rt_mutex_adjust_pi(p);
}
@@ -7796,8 +7804,8 @@ change:
unlock:
task_rq_unlock(rq, p, &rf);
- if (pi)
- cpuset_read_unlock();
+ if (cpuset_locked)
+ cpuset_unlock();
return retval;
}
@@ -9286,8 +9294,7 @@ int cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int task_can_attach(struct task_struct *p,
- const struct cpumask *cs_effective_cpus)
+int task_can_attach(struct task_struct *p)
{
int ret = 0;
@@ -9300,21 +9307,9 @@ int task_can_attach(struct task_struct *p,
* success of set_cpus_allowed_ptr() on all attached tasks
* before cpus_mask may be changed.
*/
- if (p->flags & PF_NO_SETAFFINITY) {
+ if (p->flags & PF_NO_SETAFFINITY)
ret = -EINVAL;
- goto out;
- }
- if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
- cs_effective_cpus)) {
- int cpu = cpumask_any_and(cpu_active_mask, cs_effective_cpus);
-
- if (unlikely(cpu >= nr_cpu_ids))
- return -EINVAL;
- ret = dl_cpu_busy(cpu, p);
- }
-
-out:
return ret;
}
@@ -9596,7 +9591,7 @@ static void cpuset_cpu_active(void)
static int cpuset_cpu_inactive(unsigned int cpu)
{
if (!cpuhp_tasks_frozen) {
- int ret = dl_cpu_busy(cpu, NULL);
+ int ret = dl_bw_check_overflow(cpu);
if (ret)
return ret;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 5a9a4b81c972..166c3e6eae61 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -16,6 +16,8 @@
* Fabio Checconi <fchecconi@gmail.com>
*/
+#include <linux/cpuset.h>
+
/*
* Default limits for DL period; on the top end we guard against small util
* tasks still getting ridiculously long effective runtimes, on the bottom end we
@@ -2596,6 +2598,12 @@ static void switched_from_dl(struct rq *rq, struct task_struct *p)
if (task_on_rq_queued(p) && p->dl.dl_runtime)
task_non_contending(p);
+ /*
+ * In case a task is setscheduled out from SCHED_DEADLINE we need to
+ * keep track of that on its cpuset (for correct bandwidth tracking).
+ */
+ dec_dl_tasks_cs(p);
+
if (!task_on_rq_queued(p)) {
/*
* Inactive timer is armed. However, p is leaving DEADLINE and
@@ -2636,6 +2644,12 @@ static void switched_to_dl(struct rq *rq, struct task_struct *p)
if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1)
put_task_struct(p);
+ /*
+ * In case a task is setscheduled to SCHED_DEADLINE we need to keep
+ * track of that on its cpuset (for correct bandwidth tracking).
+ */
+ inc_dl_tasks_cs(p);
+
/* If p is not queued we will update its parameters at next wakeup. */
if (!task_on_rq_queued(p)) {
add_rq_bw(&p->dl, &rq->dl);
@@ -3044,26 +3058,38 @@ int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur,
return ret;
}
-int dl_cpu_busy(int cpu, struct task_struct *p)
+/* Selects which operation dl_bw_manage() performs. */
+enum dl_bw_request {
+ dl_bw_req_check_overflow = 0, /* only test for overflow; accounting untouched */
+ dl_bw_req_alloc, /* test for overflow and, if none, add the bandwidth */
+ dl_bw_req_free /* subtract the bandwidth; no overflow test */
+};
+
+static int dl_bw_manage(enum dl_bw_request req, int cpu, u64 dl_bw)
{
- unsigned long flags, cap;
+ unsigned long flags;
struct dl_bw *dl_b;
- bool overflow;
+ bool overflow = 0;
rcu_read_lock_sched();
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- cap = dl_bw_capacity(cpu);
- overflow = __dl_overflow(dl_b, cap, 0, p ? p->dl.dl_bw : 0);
- if (!overflow && p) {
- /*
- * We reserve space for this task in the destination
- * root_domain, as we can't fail after this point.
- * We will free resources in the source root_domain
- * later on (see set_cpus_allowed_dl()).
- */
- __dl_add(dl_b, p->dl.dl_bw, dl_bw_cpus(cpu));
+ if (req == dl_bw_req_free) {
+ __dl_sub(dl_b, dl_bw, dl_bw_cpus(cpu));
+ } else {
+ unsigned long cap = dl_bw_capacity(cpu);
+
+ overflow = __dl_overflow(dl_b, cap, 0, dl_bw);
+
+ if (req == dl_bw_req_alloc && !overflow) {
+ /*
+ * We reserve space in the destination
+ * root_domain, as we can't fail after this point.
+ * We will free resources in the source root_domain
+ * later on (see set_cpus_allowed_dl()).
+ */
+ __dl_add(dl_b, dl_bw, dl_bw_cpus(cpu));
+ }
}
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
@@ -3071,6 +3097,21 @@ int dl_cpu_busy(int cpu, struct task_struct *p)
return overflow ? -EBUSY : 0;
}
+
+/*
+ * dl_bw_check_overflow - test the deadline bandwidth accounting for @cpu
+ * @cpu: CPU used to look up the dl_bw accounting and its capacity
+ *
+ * Performs the overflow test with zero additional bandwidth and does not
+ * modify the accounting.
+ *
+ * Return: 0 if there is no overflow, -EBUSY otherwise.
+ */
+int dl_bw_check_overflow(int cpu)
+{
+ return dl_bw_manage(dl_bw_req_check_overflow, cpu, 0);
+}
+
+/*
+ * dl_bw_alloc - reserve deadline bandwidth in the accounting for @cpu
+ * @cpu: CPU used to look up the dl_bw accounting and its capacity
+ * @dl_bw: amount of bandwidth to reserve
+ *
+ * The bandwidth is added only when doing so would not overflow; on
+ * overflow the accounting is left unchanged.
+ *
+ * Return: 0 on success, -EBUSY if @dl_bw does not fit.
+ */
+int dl_bw_alloc(int cpu, u64 dl_bw)
+{
+ return dl_bw_manage(dl_bw_req_alloc, cpu, dl_bw);
+}
+
+/*
+ * dl_bw_free - release deadline bandwidth from the accounting for @cpu
+ * @cpu: CPU used to look up the dl_bw accounting
+ * @dl_bw: amount of bandwidth to subtract
+ *
+ * The status returned by dl_bw_manage() is deliberately dropped: the free
+ * request performs no overflow test, so there is nothing to report.
+ */
+void dl_bw_free(int cpu, u64 dl_bw)
+{
+ dl_bw_manage(dl_bw_req_free, cpu, dl_bw);
+}
#endif
#ifdef CONFIG_SCHED_DEBUG
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 678446251c35..1704763897d0 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -330,7 +330,7 @@ extern void __getparam_dl(struct task_struct *p, struct sched_attr *attr);
extern bool __checkparam_dl(const struct sched_attr *attr);
extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr);
extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial);
-extern int dl_cpu_busy(int cpu, struct task_struct *p);
+extern int dl_bw_check_overflow(int cpu);
#ifdef CONFIG_CGROUP_SCHED