| author | Sebastian Andrzej Siewior <bigeasy@linutronix.de> | 2017-03-28 13:06:54 +0200 |
|---|---|---|
| committer | Sebastian Andrzej Siewior <bigeasy@linutronix.de> | 2017-03-28 13:06:54 +0200 |
| commit | e6a76dc34a7be542641b32770e0ca3e29f507bb9 (patch) | |
| tree | 0e10df8c17709cad27af0af4281b5cdd56b2b9c8 | |
| parent | 950728f5efcdc960f3274cb3206418d0178c866c (diff) | |
| download | linux-rt-e6a76dc34a7be542641b32770e0ca3e29f507bb9.tar.gz | |
[ANNOUNCE] v4.9.18-rt14 (tags: v4.9.18-rt14, v4.9.18-rt14-patches)
Dear RT folks!
I'm pleased to announce the v4.9.18-rt14 patch set.
Changes since v4.9.18-rt13:
- v4.9.11-rt9 had a fix for statically initialized PER_CPU locks. An
  issue with nested locks in that fix was noticed by the kernel test
  robot and has been fixed by Peter Zijlstra.
- A larger rework of the futex / rtmutex code. In v4.8-rt1 we added a
  workaround so that we do not de-boost too early in the unlock path,
  but a small window remained in which the locking thread could
  de-boost the unlocking thread. This rework by Peter Zijlstra closes
  that window; a simplified sketch of the resulting unlock ordering
  follows this list.
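To make the ordering concrete, here is a minimal user-space sketch of
the unlock sequence the rework establishes. This is not the kernel
code: the types and helpers below are invented stand-ins, and the
actual PI boost/de-boost is only mimicked by print statements. The
point is the sequence the reworked wake_futex_pi() follows: hand the
lock over while serialized, wake the new owner, and only as the very
last step drop the current thread's priority boost.

#include <stdbool.h>
#include <stdio.h>

/* Invented stand-ins for the pi_state / wait_lock machinery. */
struct waiter { const char *name; };
struct pi_lock { struct waiter *top_waiter; };

static void wake(struct waiter *w)
{
	printf("wake %s\n", w->name);	/* kernel: wake_up_q() */
}

static void deboost_self(void)
{
	printf("de-boost self\n");	/* kernel: rt_mutex_adjust_prio(current) */
}

/*
 * Fixed ordering: read the top waiter and hand the lock over while
 * serialized, then wake the new owner, and only afterwards undo our
 * own boost. Waking first ensures the unlocking thread cannot be
 * de-boosted (and scheduled away) in the middle of the handover.
 */
static void unlock_pi(struct pi_lock *l)
{
	struct waiter *next = l->top_waiter;	/* read under wait_lock */
	bool deboost = (next != NULL);

	l->top_waiter = NULL;			/* ownership handed over */
	/* wait_lock is dropped here */

	if (deboost) {
		wake(next);
		deboost_self();
	}
}

int main(void)
{
	struct waiter w = { "new owner" };
	struct pi_lock l = { &w };

	unlock_pi(&l);
	return 0;
}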
Known issues
- CPU hotplug got a little better but it can still deadlock.
- The radeon driver. The driver can hang, probably due to a change in
  the driver (or in the DRM core); the problem probably dates back to
  the v3.18 release.
- gdb. While gdb is following a task, it is possible that after a
  fork() the task ends up waiting for gdb while gdb waits for the
  task.
The delta patch against v4.9.18-rt13 is appended below and can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/incr/patch-4.9.18-rt13-rt14.patch.xz
You can get this release via the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.9.18-rt14
The RT patch against v4.9.18 can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patch-4.9.18-rt14.patch.xz
The split quilt queue is available at:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patches-4.9.18-rt14.tar.xz
Sebastian
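One pattern worth calling out before the delta: several kernel/futex.c
hunks below take a reference on the pi_state via get_pi_state() while
still holding hb->lock, drop the lock, operate on the object, and only
then call put_pi_state(), so the object cannot be freed while it is
used unlocked. A rough user-space analogue, assuming C11 atomics;
struct obj and its helpers are made-up names, and the kernel's
atomic_inc_not_zero() semantics are simplified to a plain increment:

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

/* Made-up refcounted object, loosely modelled on futex_pi_state. */
struct obj {
	atomic_int refcount;
};

static void get_obj(struct obj *o)
{
	/* kernel analogue: WARN_ON_ONCE(!atomic_inc_not_zero(&refcount)) */
	atomic_fetch_add(&o->refcount, 1);
}

static void put_obj(struct obj *o)
{
	/* free on the last reference, as put_pi_state() frees or caches */
	if (atomic_fetch_sub(&o->refcount, 1) == 1) {
		printf("last reference gone, freeing\n");
		free(o);
	}
}

int main(void)
{
	struct obj *o = malloc(sizeof(*o));

	if (!o)
		return 1;
	atomic_init(&o->refcount, 1);

	get_obj(o);	/* taken while still serialized ("hb->lock held") */
	/* lock dropped; o stays valid for the unlocked section */
	put_obj(o);	/* unlocked use finished */

	put_obj(o);	/* drop the initial reference */
	return 0;
}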
diff --git a/include/linux/smp.h b/include/linux/smp.h
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
extern void __init setup_nr_cpu_ids(void);
extern void __init smp_init(void);
+extern int __boot_cpu_id;
+
+static inline int get_boot_cpu_id(void)
+{
+ return __boot_cpu_id;
+}
+
#else /* !SMP */
static inline void smp_send_stop(void) { }
@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
static inline void smp_init(void) { }
#endif
+static inline int get_boot_cpu_id(void)
+{
+ return 0;
+}
+
#endif /* !SMP */
/*
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -355,12 +355,6 @@ static __always_inline void spin_unlock(spinlock_t *lock)
raw_spin_unlock(&lock->rlock);
}
-static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
-{
- raw_spin_unlock(&lock->rlock);
- return 0;
-}
-
static __always_inline void spin_unlock_bh(spinlock_t *lock)
{
raw_spin_unlock_bh(&lock->rlock);
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -26,7 +26,6 @@ extern void __lockfunc rt_spin_lock(spinlock_t *lock);
extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
-extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
@@ -112,7 +111,6 @@ static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
#define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
#define spin_unlock(lock) rt_spin_unlock(lock)
-#define spin_unlock_no_deboost(lock) rt_spin_unlock_no_deboost(lock)
#define spin_unlock_bh(lock) \
do { \
diff --git a/kernel/cpu.c b/kernel/cpu.c
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1562,6 +1562,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
#endif /* CONFIG_PM_SLEEP_SMP */
+int __boot_cpu_id;
+
#endif /* CONFIG_SMP */
/* Boot processor state steps */
@@ -2245,6 +2247,10 @@ void __init boot_cpu_init(void)
set_cpu_active(cpu, true);
set_cpu_present(cpu, true);
set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+ __boot_cpu_id = cpu;
+#endif
}
/*
diff --git a/kernel/futex.c b/kernel/futex.c
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -800,7 +800,7 @@ static int refill_pi_state_cache(void)
return 0;
}
-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
{
struct futex_pi_state *pi_state = current->pi_state_cache;
@@ -810,6 +810,11 @@ static struct futex_pi_state * alloc_pi_state(void)
return pi_state;
}
+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
/*
* Drops a reference to the pi_state object and frees or caches it
* when the last reference is gone.
@@ -854,7 +859,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
* Look up the task based on what TID userspace gave us.
* We dont trust it.
*/
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
{
struct task_struct *p;
@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
pi_state->owner = NULL;
raw_spin_unlock_irq(&curr->pi_lock);
- rt_mutex_unlock(&pi_state->pi_mutex);
-
+ get_pi_state(pi_state);
spin_unlock(&hb->lock);
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+
raw_spin_lock_irq(&curr->pi_lock);
}
raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
*
* [10] There is no transient state which leaves owner and user space
* TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ * hb -> futex_q, relation
+ * futex_q -> pi_state, relation
+ *
+ * (cannot be raw because hb can contain arbitrary amount
+ * of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ * {uval, pi_state}
+ *
+ * (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ * p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ * pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ * hb->lock
+ * pi_mutex->wait_lock
+ * p->pi_lock
+ *
*/
/*
@@ -980,10 +1020,12 @@ void exit_pi_state_list(struct task_struct *curr)
* the pi_state against the user space value. If correct, attach to
* it.
*/
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_pi_state *pi_state,
struct futex_pi_state **ps)
{
pid_t pid = uval & FUTEX_TID_MASK;
+ int ret, uval2;
/*
* Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1033,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
if (unlikely(!pi_state))
return -EINVAL;
+ /*
+ * We get here with hb->lock held, and having found a
+ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
+ *
+ * The waiter holding a reference on @pi_state also protects against
+ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+ * free pi_state before we can take a reference ourselves.
+ */
WARN_ON(!atomic_read(&pi_state->refcount));
/*
+ * Now that we have a pi_state, we can acquire wait_lock
+ * and do the state validation.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Since {uval, pi_state} is serialized by wait_lock, and our current
+ * uval was read without holding it, it can have changed. Verify it
+ * still is what we expect it to be, otherwise retry the entire
+ * operation.
+ */
+ if (get_futex_value_locked(&uval2, uaddr))
+ goto out_efault;
+
+ if (uval != uval2)
+ goto out_eagain;
+
+ /*
* Handle the owner died case:
*/
if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1080,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* is not 0. Inconsistent state. [5]
*/
if (pid)
- return -EINVAL;
+ goto out_einval;
/*
* Take a ref on the state and return success. [4]
*/
- goto out_state;
+ goto out_attach;
}
/*
@@ -1024,14 +1096,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* Take a ref on the state and return success. [6]
*/
if (!pid)
- goto out_state;
+ goto out_attach;
} else {
/*
* If the owner died bit is not set, then the pi_state
* must have an owner. [7]
*/
if (!pi_state->owner)
- return -EINVAL;
+ goto out_einval;
}
/*
@@ -1040,11 +1112,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
* user space TID. [9/10]
*/
if (pid != task_pid_vnr(pi_state->owner))
- return -EINVAL;
-out_state:
- atomic_inc(&pi_state->refcount);
+ goto out_einval;
+
+out_attach:
+ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
*ps = pi_state;
return 0;
+
+out_einval:
+ ret = -EINVAL;
+ goto out_error;
+
+out_eagain:
+ ret = -EAGAIN;
+ goto out_error;
+
+out_efault:
+ ret = -EFAULT;
+ goto out_error;
+
+out_error:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
/*
@@ -1095,6 +1185,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
/*
* No existing pi state. First waiter. [2]
+ *
+ * This creates pi_state, we have hb->lock held, this means nothing can
+ * observe this state, wait_lock is irrelevant.
*/
pi_state = alloc_pi_state();
@@ -1119,17 +1212,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
return 0;
}
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+ struct futex_hash_bucket *hb,
union futex_key *key, struct futex_pi_state **ps)
{
- struct futex_q *match = futex_top_waiter(hb, key);
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
/*
* If there is a waiter on that futex, validate it and
* attach to the pi_state when the validation succeeds.
*/
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* We are the first waiter - try to look up the owner based on
@@ -1148,7 +1242,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
return -EFAULT;
- /*If user space value changed, let the caller retry */
+ /* If user space value changed, let the caller retry */
return curval != uval ? -EAGAIN : 0;
}
@@ -1176,7 +1270,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
struct task_struct *task, int set_waiters)
{
u32 uval, newval, vpid = task_pid_vnr(task);
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
/*
@@ -1202,9 +1296,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
* Lookup existing state first. If it exists, try to attach to
* its pi_state.
*/
- match = futex_top_waiter(hb, key);
- if (match)
- return attach_to_pi_state(uval, match->pi_state, ps);
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
/*
* No waiter and user TID is 0. We are here because the
@@ -1290,46 +1384,39 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
* memory barrier is required here to prevent the following
* store to lock_ptr from getting ahead of the plist_del.
*/
- smp_wmb();
- q->lock_ptr = NULL;
+ smp_store_release(&q->lock_ptr, NULL);
}
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
- struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
{
- struct task_struct *new_owner;
- struct futex_pi_state *pi_state = this->pi_state;
u32 uninitialized_var(curval), newval;
+ struct task_struct *new_owner;
+ bool deboost = false;
WAKE_Q(wake_q);
WAKE_Q(wake_sleeper_q);
- bool deboost;
int ret = 0;
- if (!pi_state)
- return -EINVAL;
-
- /*
- * If current does not own the pi_state then the futex is
- * inconsistent and user space fiddled with the futex value.
- */
- if (pi_state->owner != current)
- return -EINVAL;
-
- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+ * waiting on the rtmutex or removing itself from the futex
+ * queue.
+ */
+ ret = -EAGAIN;
+ goto out_unlock;
+ }
/*
- * It is possible that the next waiter (the one that brought
- * this owner to the kernel) timed out and is no longer
- * waiting on the lock.
- */
- if (!new_owner)
- new_owner = this->task;
-
- /*
- * We pass it to the next owner. The WAITERS bit is always
- * kept enabled while there is PI state around. We cleanup the
- * owner died bit, because we are the owner.
+ * We pass it to the next owner. The WAITERS bit is always kept
+ * enabled while there is PI state around. We cleanup the owner
+ * died bit, because we are the owner.
*/
newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
@@ -1338,6 +1425,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
ret = -EFAULT;
+
} else if (curval != uval) {
/*
* If a unconditional UNLOCK_PI operation (user space did not
@@ -1350,10 +1438,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else
ret = -EINVAL;
}
- if (ret) {
- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- return ret;
- }
+
+ if (ret)
+ goto out_unlock;
raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
@@ -1366,24 +1453,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
pi_state->owner = new_owner;
raw_spin_unlock(&new_owner->pi_lock);
+ /*
+ * We've updated the uservalue, this unlock cannot fail.
+ */
+ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
+ &wake_sleeper_q);
+
+out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
- &wake_sleeper_q);
-
- /*
- * First unlock HB so the waiter does not spin on it once he got woken
- * up. Second wake up the waiter before the priority is adjusted. If we
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
- */
- deboost |= spin_unlock_no_deboost(&hb->lock);
- wake_up_q(&wake_q);
- wake_up_q_sleeper(&wake_sleeper_q);
- if (deboost)
+ if (deboost) {
+ wake_up_q(&wake_q);
+ wake_up_q_sleeper(&wake_sleeper_q);
rt_mutex_adjust_prio(current);
+ }
- return 0;
+ return ret;
}
/*
@@ -1829,7 +1914,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* If that call succeeds then we have pi_state and an
* initial refcount on it.
*/
- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
}
switch (ret) {
@@ -1912,7 +1997,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
* refcount on the pi_state and store the pointer in
* the futex_q object of the waiter.
*/
- atomic_inc(&pi_state->refcount);
+ get_pi_state(pi_state);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
@@ -2022,20 +2107,7 @@ queue_unlock(struct futex_hash_bucket *hb)
hb_waiters_dec(hb);
}
-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q: The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
- __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
@@ -2052,6 +2124,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
plist_node_init(&q->list, prio);
plist_add(&q->list, &hb->chain);
q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q: The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ __releases(&hb->lock)
+{
+ __queue_me(q, hb);
spin_unlock(&hb->lock);
}
@@ -2138,10 +2228,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
- struct task_struct *oldowner = pi_state->owner;
u32 uval, uninitialized_var(curval), newval;
+ struct task_struct *oldowner;
int ret;
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ oldowner = pi_state->owner;
/* Owner died? */
if (!pi_state->owner)
newtid |= FUTEX_OWNER_DIED;
@@ -2149,7 +2242,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
/*
* We are here either because we stole the rtmutex from the
* previous highest priority waiter or we are the highest priority
- * waiter but failed to get the rtmutex the first time.
+ * waiter but have failed to get the rtmutex the first time.
+ *
* We have to replace the newowner TID in the user space variable.
* This must be atomic as we have to preserve the owner died bit here.
*
@@ -2157,17 +2251,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* because we can fault here. Imagine swapped out pages or a fork
* that marked all the anonymous memory readonly for cow.
*
- * Modifying pi_state _before_ the user space value would
- * leave the pi_state in an inconsistent state when we fault
- * here, because we need to drop the hash bucket lock to
- * handle the fault. This might be observed in the PID check
- * in lookup_pi_state.
+ * Modifying pi_state _before_ the user space value would leave the
+ * pi_state in an inconsistent state when we fault here, because we
+ * need to drop the locks to handle the fault. This might be observed
+ * in the PID check in lookup_pi_state.
*/
retry:
if (get_futex_value_locked(&uval, uaddr))
goto handle_fault;
- while (1) {
+ for (;;) {
newval = (uval & FUTEX_OWNER_DIED) | newtid;
if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2182,47 +2275,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
* itself.
*/
if (pi_state->owner != NULL) {
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
}
pi_state->owner = newowner;
- raw_spin_lock_irq(&newowner->pi_lock);
+ raw_spin_lock(&newowner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
- raw_spin_unlock_irq(&newowner->pi_lock);
+ raw_spin_unlock(&newowner->pi_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
return 0;
/*
- * To handle the page fault we need to drop the hash bucket
- * lock here. That gives the other task (either the highest priority
- * waiter itself or the task which stole the rtmutex) the
- * chance to try the fixup of the pi_state. So once we are
- * back from handling the fault we need to check the pi_state
- * after reacquiring the hash bucket lock and before trying to
- * do another fixup. When the fixup has been done already we
- * simply return.
+ * To handle the page fault we need to drop the locks here. That gives
+ * the other task (either the highest priority waiter itself or the
+ * task which stole the rtmutex) the chance to try the fixup of the
+ * pi_state. So once we are back from handling the fault we need to
+ * check the pi_state after reacquiring the locks and before trying to
+ * do another fixup. When the fixup has been done already we simply
+ * return.
+ *
+ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+ * drop hb->lock since the caller owns the hb -> futex_q relation.
+ * Dropping the pi_mutex->wait_lock requires the state revalidate.
*/
handle_fault:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
spin_unlock(q->lock_ptr);
ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
/*
* Check if someone else fixed it for us:
*/
- if (pi_state->owner != oldowner)
- return 0;
+ if (pi_state->owner != oldowner) {
+ ret = 0;
+ goto out_unlock;
+ }
if (ret)
- return ret;
+ goto out_unlock;
goto retry;
+
+out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
}
static long futex_wait_restart(struct restart_block *restart);
@@ -2244,13 +2350,16 @@ static long futex_wait_restart(struct restart_block *restart);
*/
static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
{
- struct task_struct *owner;
int ret = 0;
if (locked) {
/*
* Got the lock. We might not be the anticipated owner if we
* did a lock-steal - fix up the PI-state in that case:
+ *
+ * We can safely read pi_state->owner without holding wait_lock
+ * because we now own the rt_mutex, only the owner will attempt
+ * to change it.
*/
if (q->pi_state->owner != current)
ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2258,43 +2367,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
}
/*
- * Catch the rare case, where the lock was released when we were on the
- * way back before we locked the hash bucket.
- */
- if (q->pi_state->owner == current) {
- /*
- * Try to get the rt_mutex now. This might fail as some other
- * task acquired the rt_mutex after we removed ourself from the
- * rt_mutex waiters list.
- */
- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
- locked = 1;
- goto out;
- }
-
- /*
- * pi_state is incorrect, some other task did a lock steal and
- * we returned due to timeout or signal without taking the
- * rt_mutex. Too late.
- */
- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
- if (!owner)
- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
- ret = fixup_pi_state_owner(uaddr, q, owner);
- goto out;
- }
-
- /*
* Paranoia check. If we did not take the lock, then we should not be
* the owner of the rt_mutex.
*/
- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
"pi-state %p\n", ret,
q->pi_state->pi_mutex.owner,
q->pi_state->owner);
+ }
out:
return ret ? ret : locked;
@@ -2518,6 +2599,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
ktime_t *time, int trylock)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb;
struct futex_q q = futex_q_init;
int res, ret;
@@ -2570,25 +2653,77 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
}
}
+ WARN_ON(!q.pi_state);
+
/*
* Only actually queue now that the atomic ops are done:
*/
- queue_me(&q, hb);
+ __queue_me(&q, hb);
- WARN_ON(!q.pi_state);
- /*
- * Block on the PI mutex:
- */
- if (!trylock) {
- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
- } else {
- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
/* Fixup the trylock return value: */
ret = ret ? 0 : -EWOULDBLOCK;
+ goto no_block;
}
+ rt_mutex_init_waiter(&rt_waiter, false);
+
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+ * hold it while doing rt_mutex_start_proxy(), because then it will
+ * include hb->lock in the blocking chain, even through we'll not in
+ * fact hold it while blocking. This will lead it to report -EDEADLK
+ * and BUG when futex_unlock_pi() interleaves with this.
+ *
+ * Therefore acquire wait_lock while holding hb->lock, but drop the
+ * latter before calling rt_mutex_start_proxy_lock(). This still fully
+ * serializes against futex_unlock_pi() as that does the exact same
+ * lock handoff sequence.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+ /*
+ * the migrate_disable() here disables migration in the in_atomic() fast
+ * path which is enabled again in the following spin_unlock(). We have
+ * one migrate_disable() pending in the slow-path which is reversed
+ * after the raw_spin_unlock_irq() where we leave the atomic context.
+ */
+ migrate_disable();
+
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+ migrate_enable();
+
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
+ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
spin_lock(q.lock_ptr);
/*
+ * If we failed to acquire the lock (signal/timeout), we must
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
+ *
+ * In particular; it is important that futex_unlock_pi() can not
+ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+
+no_block:
+ /*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
*/
@@ -2604,12 +2739,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
* If fixup_owner() faulted and was unable to handle the fault, unlock
* it and return the fault to userspace.
*/
- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
goto out_put_key;
out_unlock_put_key:
@@ -2646,7 +2788,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
union futex_key key = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb;
- struct futex_q *match;
+ struct futex_q *top_waiter;
int ret;
retry:
@@ -2670,12 +2812,48 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* all and we at least want to know if user space fiddled
* with the futex value instead of blindly unlocking.
*/
- match = futex_top_waiter(hb, &key);
- if (match) {
- ret = wake_futex_pi(uaddr, uval, match, hb);
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+ struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+ ret = -EINVAL;
+ if (!pi_state)
+ goto out_unlock;
+
/*
- * In case of success wake_futex_pi dropped the hash
- * bucket lock.
+ * If current does not own the pi_state then the futex is
+ * inconsistent and user space fiddled with the futex value.
+ */
+ if (pi_state->owner != current)
+ goto out_unlock;
+
+ get_pi_state(pi_state);
+ /*
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ /*
+ * Magic trickery for now to make the RT migrate disable
+ * logic happy. The following spin_unlock() happens with
+ * interrupts disabled so the internal migrate_enable()
+ * won't undo the migrate_disable() which was issued when
+ * locking hb->lock.
+ */
+ migrate_disable();
+ spin_unlock(&hb->lock);
+
+ /* Drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
+ migrate_enable();
+
+ put_pi_state(pi_state);
+
+ /*
+ * Success, we're done! No tricky corner cases.
*/
if (!ret)
goto out_putkey;
@@ -2690,7 +2868,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* setting the FUTEX_WAITERS bit. Try again.
*/
if (ret == -EAGAIN) {
- spin_unlock(&hb->lock);
put_futex_key(&key);
goto retry;
}
@@ -2698,7 +2875,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* wake_futex_pi has detected invalid state. Tell user
* space.
*/
- goto out_unlock;
+ goto out_putkey;
}
/*
@@ -2708,8 +2885,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
* preserve the WAITERS bit not the OWNER_DIED one. We are the
* owner.
*/
- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+ spin_unlock(&hb->lock);
goto pi_faulted;
+ }
/*
* If uval has changed, let user space handle it.
@@ -2723,7 +2902,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
return ret;
pi_faulted:
- spin_unlock(&hb->lock);
put_futex_key(&key);
ret = fault_in_user_writeable(uaddr);
@@ -2827,6 +3005,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
u32 __user *uaddr2)
{
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
struct futex_hash_bucket *hb, *hb2;
union futex_key key2 = FUTEX_KEY_INIT;
@@ -2944,8 +3123,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
spin_lock(&hb2->lock);
BUG_ON(&hb2->lock != q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/*
* Drop the reference to the pi state which
* the requeue_pi() code acquired for us.
@@ -2963,11 +3144,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
*/
WARN_ON(!q.pi_state);
pi_mutex = &q.pi_state->pi_mutex;
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
spin_lock(&hb2->lock);
BUG_ON(&hb2->lock != q.lock_ptr);
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
+ debug_rt_mutex_free_waiter(&rt_waiter);
/*
* Fixup the pi_state owner and possibly acquire the lock if we
* haven't already.
@@ -2985,13 +3169,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
* the fault, unlock the rt_mutex and return the fault to
* userspace.
*/
- if (ret && rt_mutex_owner(pi_mutex) == current)
- rt_mutex_unlock(pi_mutex);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+ get_pi_state(pi_state);
+ }
/* Unqueue and drop the lock. */
unqueue_me_pi(&q);
}
+ if (pi_state) {
+ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+ put_pi_state(pi_state);
+ }
+
if (ret == -EINTR) {
/*
* We've already been requeued, but cannot restart by calling
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
lock->name = name;
}
-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
* This file contains macros used solely by rtmutex.c. Debug version.
*/
-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -978,8 +978,6 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock,
*/
rt_mutex_set_owner(lock, task);
- rt_mutex_deadlock_account_lock(lock, task);
-
return 1;
}
@@ -998,19 +996,18 @@ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
migrate_disable();
if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- rt_mutex_deadlock_account_lock(lock, current);
+ return;
else
slowfn(lock, do_mig_dis);
}
-static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
- int (*slowfn)(struct rt_mutex *lock))
+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
+ void (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return 0;
- }
- return slowfn(lock);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
+ else
+ slowfn(lock);
}
#ifdef CONFIG_SMP
/*
@@ -1151,7 +1148,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
/*
* Slow path to release a rt_mutex spin_lock style
*/
-static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
+static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
{
unsigned long flags;
WAKE_Q(wake_q);
@@ -1161,12 +1158,10 @@ static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
debug_rt_mutex_unlock(lock);
- rt_mutex_deadlock_account_unlock(current);
-
if (!rt_mutex_has_waiters(lock)) {
lock->owner = NULL;
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return 0;
+ return;
}
mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
@@ -1177,33 +1172,6 @@ static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
/* Undo pi boosting.when necessary */
rt_mutex_adjust_prio(current);
- return 0;
-}
-
-static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
-{
- unsigned long flags;
- WAKE_Q(wake_q);
- WAKE_Q(wake_sleeper_q);
-
- raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
- debug_rt_mutex_unlock(lock);
-
- rt_mutex_deadlock_account_unlock(current);
-
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- return 0;
- }
-
- mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
-
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
- wake_up_q(&wake_q);
- wake_up_q_sleeper(&wake_sleeper_q);
- return 1;
}
void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
@@ -1258,17 +1226,6 @@ void __lockfunc rt_spin_unlock(spinlock_t *lock)
}
EXPORT_SYMBOL(rt_spin_unlock);
-int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
-{
- int ret;
-
- /* NOTE: we always pass in '1' for nested, for simplicity */
- spin_release(&lock->dep_map, 1, _RET_IP_);
- ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
- migrate_enable();
- return ret;
-}
-
void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
{
rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
@@ -1644,6 +1601,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
next_lock, NULL, task);
}
+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
+{
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
+ waiter->savestate = savestate;
+}
+
/**
* __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
* @lock: the rt_mutex to take
@@ -1926,8 +1892,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
debug_rt_mutex_unlock(lock);
- rt_mutex_deadlock_account_unlock(current);
-
/*
* We must be careful here if the fast path is enabled. If we
* have no waiters queued we cannot set owner to NULL here
@@ -1995,12 +1959,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
enum rtmutex_chainwalk chwalk,
struct ww_acquire_ctx *ww_ctx))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
- ww_ctx);
+
+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
}
static inline int
@@ -2014,21 +1976,19 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
struct ww_acquire_ctx *ww_ctx))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
- return slowfn(lock, state, timeout, chwalk, ww_ctx);
+
+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
}
static inline int
rt_mutex_fasttrylock(struct rt_mutex *lock,
int (*slowfn)(struct rt_mutex *lock))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 1;
- }
+
return slowfn(lock);
}
@@ -2040,20 +2000,19 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
{
WAKE_Q(wake_q);
WAKE_Q(wake_sleeper_q);
+ bool deboost;
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- } else {
- bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
+ deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
- wake_up_q(&wake_q);
- wake_up_q_sleeper(&wake_sleeper_q);
+ wake_up_q(&wake_q);
+ wake_up_q_sleeper(&wake_sleeper_q);
- /* Undo pi boosting if necessary: */
- if (deboost)
- rt_mutex_adjust_prio(current);
- }
+ /* Undo pi boosting if necessary: */
+ if (deboost)
+ rt_mutex_adjust_prio(current);
}
/**
@@ -2087,16 +2046,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
/*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
*/
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
{
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
- RT_MUTEX_FULL_CHAINWALK, NULL,
- rt_mutex_slowlock);
+ return rt_mutex_slowtrylock(lock);
}
/**
@@ -2179,21 +2133,41 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
EXPORT_SYMBOL_GPL(rt_mutex_unlock);
/**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
*/
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh,
- struct wake_q_head *wq_sleeper)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wake_q,
+ struct wake_q_head *wq_sleeper)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return false;
+ lockdep_assert_held(&lock->wait_lock);
+
+ debug_rt_mutex_unlock(lock);
+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ return false; /* done */
+ }
+
+ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
+ return true; /* deboost and wakeups */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+ WAKE_Q(wake_q);
+ WAKE_Q(wake_sleeper_q);
+ bool deboost;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ deboost = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (deboost) {
+ wake_up_q(&wake_q);
+ wake_up_q_sleeper(&wake_sleeper_q);
+ rt_mutex_adjust_prio(current);
}
- return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
}
/**
@@ -2249,7 +2223,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
rt_mutex_init(lock);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
}
/**
@@ -2265,34 +2238,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
{
debug_rt_mutex_proxy_unlock(lock);
rt_mutex_set_owner(lock, NULL);
- rt_mutex_deadlock_account_unlock(proxy_owner);
}
-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock: the rt_mutex to take
- * @waiter: the pre-initialized rt_mutex_waiter
- * @task: the task to prepare
- *
- * Returns:
- * 0 - task blocked on lock
- * 1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task)
{
int ret;
- raw_spin_lock_irq(&lock->wait_lock);
-
- if (try_to_take_rt_mutex(lock, task, NULL)) {
- raw_spin_unlock_irq(&lock->wait_lock);
+ if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
- }
#ifdef CONFIG_PREEMPT_RT_FULL
/*
@@ -2340,14 +2295,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
if (ret && rt_mutex_has_waiters(lock))
remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
-
debug_rt_mutex_print_deadlock(waiter);
return ret;
}
/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock: the rt_mutex to take
+ * @waiter: the pre-initialized rt_mutex_waiter
+ * @task: the task to prepare
+ *
+ * Returns:
+ * 0 - task blocked on lock
+ * 1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+{
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
* rt_mutex_next_owner - return the next owner of the lock
*
* @lock: the rt lock query
@@ -2368,21 +2347,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
}
/**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
* @lock: the rt_mutex we were woken on
* @to: the timeout, null if none. hrtimer should already have
* been started.
* @waiter: the pre-initialized rt_mutex_waiter
*
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
*
* Returns:
* 0 - success
* <0 - error, one of -EINTR, -ETIMEDOUT
*
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
*/
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
struct hrtimer_sleeper *to,
struct rt_mutex_waiter *waiter)
{
@@ -2395,8 +2376,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
/* sleep on the mutex */
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
- if (unlikely(ret))
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock, we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ * true - did the cleanup, we are done.
+ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ * caller should disregard its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter)
+{
+ bool cleanup = false;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -2406,7 +2424,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
raw_spin_unlock_irq(&lock->wait_lock);
- return ret;
+ return cleanup;
}
static inline int
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
*/
#define rt_mutex_deadlock_check(l) (0)
-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
#define debug_rt_mutex_init_waiter(w) do { } while (0)
#define debug_rt_mutex_free_waiter(w) do { } while (0)
#define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -107,16 +107,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner);
extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
struct rt_mutex_waiter *waiter,
struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
- struct hrtimer_sleeper *to,
- struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
- struct wake_q_head *wqh,
- struct wake_q_head *wq_sleeper);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh,
+ struct wake_q_head *wq_sleeper);
+
extern void rt_mutex_adjust_prio(struct task_struct *task);
#ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -125,14 +135,4 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
# include "rtmutex.h"
#endif
-static inline void
-rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
-{
- debug_rt_mutex_init_waiter(waiter);
- waiter->task = NULL;
- waiter->savestate = savestate;
- RB_CLEAR_NODE(&waiter->pi_tree_entry);
- RB_CLEAR_NODE(&waiter->tree_entry);
-}
-
#endif
diff --git a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -677,8 +677,12 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
void *va = (void *)addr;
if (va >= start && va < start + mod->percpu_size) {
- if (can_addr)
+ if (can_addr) {
*can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(mod->percpu,
+ get_boot_cpu_id());
+ }
preempt_enable();
return true;
}
diff --git a/localversion-rt b/localversion-rt
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt13
+-rt14
diff --git a/mm/percpu.c b/mm/percpu.c
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1295,8 +1295,11 @@ bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
void *va = (void *)addr;
if (va >= start && va < start + static_size) {
- if (can_addr)
+ if (can_addr) {
*can_addr = (unsigned long) (va - start);
+ *can_addr += (unsigned long)
+ per_cpu_ptr(base, get_boot_cpu_id());
+ }
return true;
}
}
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
28 files changed, 2660 insertions, 324 deletions
diff --git a/patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch b/patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch new file mode 100644 index 000000000000..128cf8001839 --- /dev/null +++ b/patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch @@ -0,0 +1,117 @@ +From: Peter Zijlstra <peterz@infradead.org> +Date: Wed, 22 Mar 2017 11:35:48 +0100 +Subject: [PATCH] futex: Cleanup variable names for futex_top_waiter() + +Upstream commit 499f5aca2cdd5e958b27e2655e7e7f82524f46b1 + +futex_top_waiter() returns the top-waiter on the pi_mutex. Assinging +this to a variable 'match' totally obscures the code. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: juri.lelli@arm.com +Cc: bigeasy@linutronix.de +Cc: xlpang@redhat.com +Cc: rostedt@goodmis.org +Cc: mathieu.desnoyers@efficios.com +Cc: jdesfossez@efficios.com +Cc: dvhart@infradead.org +Cc: bristot@redhat.com +Link: http://lkml.kernel.org/r/20170322104151.554710645@infradead.org +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/futex.c | 30 +++++++++++++++--------------- + 1 file changed, 15 insertions(+), 15 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1120,14 +1120,14 @@ static int attach_to_pi_owner(u32 uval, + static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, + union futex_key *key, struct futex_pi_state **ps) + { +- struct futex_q *match = futex_top_waiter(hb, key); ++ struct futex_q *top_waiter = futex_top_waiter(hb, key); + + /* + * If there is a waiter on that futex, validate it and + * attach to the pi_state when the validation succeeds. + */ +- if (match) +- return attach_to_pi_state(uval, match->pi_state, ps); ++ if (top_waiter) ++ return attach_to_pi_state(uval, top_waiter->pi_state, ps); + + /* + * We are the first waiter - try to look up the owner based on +@@ -1174,7 +1174,7 @@ static int futex_lock_pi_atomic(u32 __us + struct task_struct *task, int set_waiters) + { + u32 uval, newval, vpid = task_pid_vnr(task); +- struct futex_q *match; ++ struct futex_q *top_waiter; + int ret; + + /* +@@ -1200,9 +1200,9 @@ static int futex_lock_pi_atomic(u32 __us + * Lookup existing state first. If it exists, try to attach to + * its pi_state. + */ +- match = futex_top_waiter(hb, key); +- if (match) +- return attach_to_pi_state(uval, match->pi_state, ps); ++ top_waiter = futex_top_waiter(hb, key); ++ if (top_waiter) ++ return attach_to_pi_state(uval, top_waiter->pi_state, ps); + + /* + * No waiter and user TID is 0. We are here because the +@@ -1292,11 +1292,11 @@ static void mark_wake_futex(struct wake_ + q->lock_ptr = NULL; + } + +-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, ++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter, + struct futex_hash_bucket *hb) + { + struct task_struct *new_owner; +- struct futex_pi_state *pi_state = this->pi_state; ++ struct futex_pi_state *pi_state = top_waiter->pi_state; + u32 uninitialized_var(curval), newval; + WAKE_Q(wake_q); + bool deboost; +@@ -1317,11 +1317,11 @@ static int wake_futex_pi(u32 __user *uad + + /* + * It is possible that the next waiter (the one that brought +- * this owner to the kernel) timed out and is no longer ++ * top_waiter owner to the kernel) timed out and is no longer + * waiting on the lock. + */ + if (!new_owner) +- new_owner = this->task; ++ new_owner = top_waiter->task; + + /* + * We pass it to the next owner. 
The WAITERS bit is always +@@ -2631,7 +2631,7 @@ static int futex_unlock_pi(u32 __user *u + u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current); + union futex_key key = FUTEX_KEY_INIT; + struct futex_hash_bucket *hb; +- struct futex_q *match; ++ struct futex_q *top_waiter; + int ret; + + retry: +@@ -2655,9 +2655,9 @@ static int futex_unlock_pi(u32 __user *u + * all and we at least want to know if user space fiddled + * with the futex value instead of blindly unlocking. + */ +- match = futex_top_waiter(hb, &key); +- if (match) { +- ret = wake_futex_pi(uaddr, uval, match, hb); ++ top_waiter = futex_top_waiter(hb, &key); ++ if (top_waiter) { ++ ret = wake_futex_pi(uaddr, uval, top_waiter, hb); + /* + * In case of success wake_futex_pi dropped the hash + * bucket lock. diff --git a/patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch b/patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch new file mode 100644 index 000000000000..29c184a5184d --- /dev/null +++ b/patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch @@ -0,0 +1,38 @@ +From: Peter Zijlstra <peterz@infradead.org> +Date: Wed, 22 Mar 2017 11:35:49 +0100 +Subject: [PATCH] futex: Use smp_store_release() in mark_wake_futex() + +Upstream commit 1b367ece0d7e696cab1c8501bab282cc6a538b3f + +Since the futex_q can dissapear the instruction after assigning NULL, +this really should be a RELEASE barrier. That stops loads from hitting +dead memory too. + +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: juri.lelli@arm.com +Cc: bigeasy@linutronix.de +Cc: xlpang@redhat.com +Cc: rostedt@goodmis.org +Cc: mathieu.desnoyers@efficios.com +Cc: jdesfossez@efficios.com +Cc: dvhart@infradead.org +Cc: bristot@redhat.com +Link: http://lkml.kernel.org/r/20170322104151.604296452@infradead.org +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/futex.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -1288,8 +1288,7 @@ static void mark_wake_futex(struct wake_ + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. + */ +- smp_wmb(); +- q->lock_ptr = NULL; ++ smp_store_release(&q->lock_ptr, NULL); + } + + static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter, diff --git a/patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch b/patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch new file mode 100644 index 000000000000..630982fb1310 --- /dev/null +++ b/patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch @@ -0,0 +1,184 @@ +From: Peter Zijlstra <peterz@infradead.org> +Date: Wed, 22 Mar 2017 11:35:50 +0100 +Subject: [PATCH] futex: Remove rt_mutex_deadlock_account_*() + +Upstream commit fffa954fb528963c2fb7b0c0084eb77e2be7ab52 + +These are unused and clutter up the code. 
+ +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: juri.lelli@arm.com +Cc: bigeasy@linutronix.de +Cc: xlpang@redhat.com +Cc: rostedt@goodmis.org +Cc: mathieu.desnoyers@efficios.com +Cc: jdesfossez@efficios.com +Cc: dvhart@infradead.org +Cc: bristot@redhat.com +Link: http://lkml.kernel.org/r/20170322104151.652692478@infradead.org +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex-debug.c | 9 ------- + kernel/locking/rtmutex-debug.h | 3 -- + kernel/locking/rtmutex.c | 47 +++++++++++++++-------------------------- + kernel/locking/rtmutex.h | 2 - + 4 files changed, 18 insertions(+), 43 deletions(-) + +--- a/kernel/locking/rtmutex-debug.c ++++ b/kernel/locking/rtmutex-debug.c +@@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex + lock->name = name; + } + +-void +-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task) +-{ +-} +- +-void rt_mutex_deadlock_account_unlock(struct task_struct *task) +-{ +-} +- +--- a/kernel/locking/rtmutex-debug.h ++++ b/kernel/locking/rtmutex-debug.h +@@ -9,9 +9,6 @@ + * This file contains macros used solely by rtmutex.c. Debug version. + */ + +-extern void +-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task); +-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task); + extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); + extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter); + extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name); +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -936,8 +936,6 @@ static int try_to_take_rt_mutex(struct r + */ + rt_mutex_set_owner(lock, task); + +- rt_mutex_deadlock_account_lock(lock, task); +- + return 1; + } + +@@ -1340,8 +1338,6 @@ static bool __sched rt_mutex_slowunlock( + + debug_rt_mutex_unlock(lock); + +- rt_mutex_deadlock_account_unlock(current); +- + /* + * We must be careful here if the fast path is enabled. 
+ * have no waiters queued we cannot set owner to NULL here
+@@ -1407,11 +1403,10 @@ rt_mutex_fastlock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk))
+ {
+- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
+- rt_mutex_deadlock_account_lock(lock, current);
++ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+- } else
+- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
++
++ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+ }
+
+ static inline int
+@@ -1423,21 +1418,19 @@ rt_mutex_timed_fastlock(struct rt_mutex
+ enum rtmutex_chainwalk chwalk))
+ {
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
+- rt_mutex_deadlock_account_lock(lock, current);
++ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+- } else
+- return slowfn(lock, state, timeout, chwalk);
++
++ return slowfn(lock, state, timeout, chwalk);
+ }
+
+ static inline int
+ rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
+ {
+- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
+- rt_mutex_deadlock_account_lock(lock, current);
++ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+- }
++
+ return slowfn(lock);
+ }
+
+@@ -1447,19 +1440,18 @@ rt_mutex_fastunlock(struct rt_mutex *loc
+ struct wake_q_head *wqh))
+ {
+ WAKE_Q(wake_q);
++ bool deboost;
+
+- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
+- rt_mutex_deadlock_account_unlock(current);
++ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
++ return;
+
+- } else {
+- bool deboost = slowfn(lock, &wake_q);
++ deboost = slowfn(lock, &wake_q);
+
+- wake_up_q(&wake_q);
++ wake_up_q(&wake_q);
+
+- /* Undo pi boosting if necessary: */
+- if (deboost)
+- rt_mutex_adjust_prio(current);
+- }
++ /* Undo pi boosting if necessary: */
++ if (deboost)
++ rt_mutex_adjust_prio(current);
+ }
+
+ /**
+@@ -1570,10 +1562,9 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+ bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh)
+ {
+- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
+- rt_mutex_deadlock_account_unlock(current);
++ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return false;
+- }
++
+ return rt_mutex_slowunlock(lock, wqh);
+ }
+
+@@ -1631,7 +1622,6 @@ void rt_mutex_init_proxy_locked(struct r
+ __rt_mutex_init(lock, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
+ rt_mutex_set_owner(lock, proxy_owner);
+- rt_mutex_deadlock_account_lock(lock, proxy_owner);
+ }
+
+ /**
+@@ -1647,7 +1637,6 @@ void rt_mutex_proxy_unlock(struct rt_mut
+ {
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL);
+- rt_mutex_deadlock_account_unlock(proxy_owner);
+ }
+
+ /**
+--- a/kernel/locking/rtmutex.h
++++ b/kernel/locking/rtmutex.h
+@@ -11,8 +11,6 @@
+ */
+
+ #define rt_mutex_deadlock_check(l) (0)
+-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+ #define debug_rt_mutex_init_waiter(w) do { } while (0)
+ #define debug_rt_mutex_free_waiter(w) do { } while (0)
+ #define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch b/patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch
new file mode 100644
index 000000000000..5f39524b167b
--- /dev/null
+++ b/patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch
@@ -0,0 +1,220 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:51 +0100
+Subject: [PATCH] futex,rt_mutex: Provide futex specific rt_mutex API
+
+Upstream commit 5293c2efda37775346885c7e924d4ef7018ea60b
+
+Part of what makes futex_unlock_pi() intricate is that
+rt_mutex_futex_unlock() -> rt_mutex_slowunlock() can drop
+rt_mutex::wait_lock.
+
+This means it cannot rely on the atomicity of wait_lock, which would be
+preferred in order to not rely on hb->lock so much.
+
+The reason rt_mutex_slowunlock() needs to drop wait_lock is that it can
+race with the rt_mutex fastpath; futexes, however, have their own fast path.
+
+Since futexes already have a bunch of separate rt_mutex accessors, complete
+that set and implement an rt_mutex variant without fastpath for them.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.702962446@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 30 ++++++++++-----------
+ kernel/locking/rtmutex.c | 55 +++++++++++++++++++++++++++++-----------
+ kernel/locking/rtmutex_common.h | 9 +++++-
+ 3 files changed, 62 insertions(+), 32 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -914,7 +914,7 @@ void exit_pi_state_list(struct task_stru
+ pi_state->owner = NULL;
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+- rt_mutex_unlock(&pi_state->pi_mutex);
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+
+ spin_unlock(&hb->lock);
+
+@@ -1362,20 +1362,18 @@ static int wake_futex_pi(u32 __user *uad
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+
+- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+-
+- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+-
+ /*
+- * First unlock HB so the waiter does not spin on it once he got woken
+- * up. Second wake up the waiter before the priority is adjusted. If we
+- * deboost first (and lose our higher priority), then the task might get
+- * scheduled away before the wake up can take place.
++ * We've updated the uservalue, this unlock cannot fail.
+ */
++ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
++
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+- wake_up_q(&wake_q);
+- if (deboost)
++
++ if (deboost) {
++ wake_up_q(&wake_q);
+ rt_mutex_adjust_prio(current);
++ }
+
+ return 0;
+ }
+@@ -2251,7 +2249,7 @@ static int fixup_owner(u32 __user *uaddr
+ * task acquired the rt_mutex after we removed ourself from the
+ * rt_mutex waiters list.
+ */
+- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
++ if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
+ locked = 1;
+ goto out;
+ }
+@@ -2566,7 +2564,7 @@ static int futex_lock_pi(u32 __user *uad
+ if (!trylock) {
+ ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+ } else {
+- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
++ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ }
+@@ -2589,7 +2587,7 @@ static int futex_lock_pi(u32 __user *uad
+ * it and return the fault to userspace.
+ */
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+- rt_mutex_unlock(&q.pi_state->pi_mutex);
++ rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+@@ -2896,7 +2894,7 @@ static int futex_wait_requeue_pi(u32 __u
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+- rt_mutex_unlock(&q.pi_state->pi_mutex);
++ rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+@@ -2936,7 +2934,7 @@ static int futex_wait_requeue_pi(u32 __u
+ * userspace.
+ */
+ if (ret && rt_mutex_owner(pi_mutex) == current)
+- rt_mutex_unlock(pi_mutex);
++ rt_mutex_futex_unlock(pi_mutex);
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1486,15 +1486,23 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interrup
+
+ /*
+ * Futex variant with full deadlock detection.
++ * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+ */
+-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
++int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout)
+ {
+ might_sleep();
+
+- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+- RT_MUTEX_FULL_CHAINWALK,
+- rt_mutex_slowlock);
++ return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
++ timeout, RT_MUTEX_FULL_CHAINWALK);
++}
++
++/*
++ * Futex variant, must not use fastpath.
++ */
++int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
++{
++ return rt_mutex_slowtrylock(lock);
+ }
+
+ /**
+@@ -1553,19 +1561,38 @@ void __sched rt_mutex_unlock(struct rt_m
+ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+ /**
+- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+- * @lock: the rt_mutex to be unlocked
+- *
+- * Returns: true/false indicating whether priority adjustment is
+- * required or not.
++ * Futex variant which, since futex variants do not use the fast-path, can be
++ * simple and will not need to retry.
+ */
+-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+- struct wake_q_head *wqh)
++bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
++ struct wake_q_head *wake_q)
++{
++ lockdep_assert_held(&lock->wait_lock);
++
++ debug_rt_mutex_unlock(lock);
++
++ if (!rt_mutex_has_waiters(lock)) {
++ lock->owner = NULL;
++ return false; /* done */
++ }
++
++ mark_wakeup_next_waiter(wake_q, lock);
++ return true; /* deboost and wakeups */
++}
++
++void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+ {
+- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+- return false;
++ WAKE_Q(wake_q);
++ bool deboost;
+
+- return rt_mutex_slowunlock(lock, wqh);
++ raw_spin_lock_irq(&lock->wait_lock);
++ deboost = __rt_mutex_futex_unlock(lock, &wake_q);
++ raw_spin_unlock_irq(&lock->wait_lock);
++
++ if (deboost) {
++ wake_up_q(&wake_q);
++ rt_mutex_adjust_prio(current);
++ }
+ }
+
+ /**
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -109,9 +109,14 @@ extern int rt_mutex_start_proxy_lock(str
+ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
++
+ extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
+- struct wake_q_head *wqh);
++extern int rt_mutex_futex_trylock(struct rt_mutex *l);
++
++extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
++extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
++ struct wake_q_head *wqh);
++
+ extern void rt_mutex_adjust_prio(struct task_struct *task);
+
+ #ifdef CONFIG_DEBUG_RT_MUTEXES
diff --git a/patches/0005-futex-Change-locking-rules.patch b/patches/0005-futex-Change-locking-rules.patch
new file mode 100644
index 000000000000..a6a3f0ad08fe
--- /dev/null
+++ b/patches/0005-futex-Change-locking-rules.patch
@@ -0,0 +1,370 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:52 +0100
+Subject: [PATCH] futex: Change locking rules
+
+Upstream commit 734009e96d1983ad739e5b656e03430b3660c913
+
+Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
+creates another set of problems, especially priority inversions on RT where
+hb->lock becomes an rt_mutex itself.
+
+The rt_mutex::wait_lock is the most obvious protection for keeping the
+futex user space value and the kernel internal pi_state in sync.
+
+Rework and document the locking so rt_mutex::wait_lock is held across all
+operations which modify the user space value and the pi state.
+
+This allows invoking rt_mutex_unlock() (including deboost) without holding
+hb->lock as a next step.
+
+Nothing yet relies on the new locking rules.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 165 +++++++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 132 insertions(+), 33 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -971,6 +971,39 @@ void exit_pi_state_list(struct task_stru
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
++ *
++ *
++ * Serialization and lifetime rules:
++ *
++ * hb->lock:
++ *
++ * hb -> futex_q, relation
++ * futex_q -> pi_state, relation
++ *
++ * (cannot be raw because hb can contain arbitrary amount
++ * of futex_q's)
++ *
++ * pi_mutex->wait_lock:
++ *
++ * {uval, pi_state}
++ *
++ * (and pi_mutex 'obviously')
++ *
++ * p->pi_lock:
++ *
++ * p->pi_state_list -> pi_state->list, relation
++ *
++ * pi_state->refcount:
++ *
++ * pi_state lifetime
++ *
++ *
++ * Lock order:
++ *
++ * hb->lock
++ * pi_mutex->wait_lock
++ * p->pi_lock
++ *
+ */
+
+ /*
+@@ -978,10 +1011,12 @@ void exit_pi_state_list(struct task_stru
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
++static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
++ struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+ {
+ pid_t pid = uval & FUTEX_TID_MASK;
++ int ret, uval2;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+@@ -989,9 +1024,34 @@ static int attach_to_pi_state(u32 uval,
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
++ /*
++ * We get here with hb->lock held, and having found a
++ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
++ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
++ * which in turn means that futex_lock_pi() still has a reference on
++ * our pi_state.
++ */
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+ /*
++ * Now that we have a pi_state, we can acquire wait_lock
++ * and do the state validation.
++ */
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ /*
++ * Since {uval, pi_state} is serialized by wait_lock, and our current
++ * uval was read without holding it, it can have changed. Verify it
++ * still is what we expect it to be, otherwise retry the entire
++ * operation.
++ */
++ if (get_futex_value_locked(&uval2, uaddr))
++ goto out_efault;
++
++ if (uval != uval2)
++ goto out_eagain;
++
++ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+@@ -1006,11 +1066,11 @@ static int attach_to_pi_state(u32 uval,
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+- return -EINVAL;
++ goto out_einval;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+- goto out_state;
++ goto out_attach;
+ }
+
+ /*
+@@ -1022,14 +1082,14 @@ static int attach_to_pi_state(u32 uval,
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+- goto out_state;
++ goto out_attach;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+- return -EINVAL;
++ goto out_einval;
+ }
+
+ /*
+@@ -1038,11 +1098,29 @@ static int attach_to_pi_state(u32 uval,
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+- return -EINVAL;
+-out_state:
++ goto out_einval;
++
++out_attach:
+ atomic_inc(&pi_state->refcount);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
++
++out_einval:
++ ret = -EINVAL;
++ goto out_error;
++
++out_eagain:
++ ret = -EAGAIN;
++ goto out_error;
++
++out_efault:
++ ret = -EFAULT;
++ goto out_error;
++
++out_error:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return ret;
+ }
+
+ /*
+@@ -1093,6 +1171,9 @@ static int attach_to_pi_owner(u32 uval,
+
+ /*
+ * No existing pi state. First waiter. [2]
++ *
++ * This creates pi_state, we have hb->lock held, this means nothing can
++ * observe this state, wait_lock is irrelevant.
+ */
+ pi_state = alloc_pi_state();
+
+@@ -1117,7 +1198,8 @@ static int attach_to_pi_owner(u32 uval,
+ return 0;
+ }
+
+-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
++static int lookup_pi_state(u32 __user *uaddr, u32 uval,
++ struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+ {
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
+@@ -1127,7 +1209,7 @@ static int lookup_pi_state(u32 uval, str
+ * attach to the pi_state when the validation succeeds.
+ */
+ if (top_waiter)
+- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+@@ -1146,7 +1228,7 @@ static int lock_pi_update_atomic(u32 __u
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+ return -EFAULT;
+
+- /*If user space value changed, let the caller retry */
++ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+ }
+
+@@ -1202,7 +1284,7 @@ static int futex_lock_pi_atomic(u32 __us
+ */
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+@@ -1334,6 +1416,7 @@ static int wake_futex_pi(u32 __user *uad
+
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+ ret = -EFAULT;
++
+ } else if (curval != uval) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+ else
+ ret = -EINVAL;
+ }
++
+ if (ret) {
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+@@ -1821,7 +1905,7 @@ static int futex_requeue(u32 __user *uad
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
+ */
+- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
++ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+ }
+
+ switch (ret) {
+@@ -2120,10 +2204,13 @@ static int fixup_pi_state_owner(u32 __us
+ {
+ u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+- struct task_struct *oldowner = pi_state->owner;
+ u32 uval, uninitialized_var(curval), newval;
++ struct task_struct *oldowner;
+ int ret;
+
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ oldowner = pi_state->owner;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+@@ -2139,11 +2226,10 @@ static int fixup_pi_state_owner(u32 __us
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+- * Modifying pi_state _before_ the user space value would
+- * leave the pi_state in an inconsistent state when we fault
+- * here, because we need to drop the hash bucket lock to
+- * handle the fault. This might be observed in the PID check
+- * in lookup_pi_state.
++ * Modifying pi_state _before_ the user space value would leave the
++ * pi_state in an inconsistent state when we fault here, because we
++ * need to drop the locks to handle the fault. This might be observed
++ * in the PID check in lookup_pi_state.
+ */
+ retry:
+ if (get_futex_value_locked(&uval, uaddr))
+@@ -2164,47 +2250,60 @@ static int fixup_pi_state_owner(u32 __us
+ * itself.
+ */
+ if (pi_state->owner != NULL) {
+- raw_spin_lock_irq(&pi_state->owner->pi_lock);
++ raw_spin_lock(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
++ raw_spin_unlock(&pi_state->owner->pi_lock);
+ }
+
+ pi_state->owner = newowner;
+
+- raw_spin_lock_irq(&newowner->pi_lock);
++ raw_spin_lock(&newowner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &newowner->pi_state_list);
+- raw_spin_unlock_irq(&newowner->pi_lock);
++ raw_spin_unlock(&newowner->pi_lock);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++
+ return 0;
+
+ /*
+- * To handle the page fault we need to drop the hash bucket
+- * lock here. That gives the other task (either the highest priority
+- * waiter itself or the task which stole the rtmutex) the
+- * chance to try the fixup of the pi_state. So once we are
+- * back from handling the fault we need to check the pi_state
+- * after reacquiring the hash bucket lock and before trying to
+- * do another fixup. When the fixup has been done already we
+- * simply return.
++ * To handle the page fault we need to drop the locks here. That gives
++ * the other task (either the highest priority waiter itself or the
++ * task which stole the rtmutex) the chance to try the fixup of the
++ * pi_state. So once we are back from handling the fault we need to
++ * check the pi_state after reacquiring the locks and before trying to
++ * do another fixup. When the fixup has been done already we simply
++ * return.
++ *
++ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
++ * drop hb->lock since the caller owns the hb -> futex_q relation.
++ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+ handle_fault:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+ ret = fault_in_user_writeable(uaddr);
+
+ spin_lock(q->lock_ptr);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+- if (pi_state->owner != oldowner)
+- return 0;
++ if (pi_state->owner != oldowner) {
++ ret = 0;
++ goto out_unlock;
++ }
+
+ if (ret)
+- return ret;
++ goto out_unlock;
+
+ goto retry;
++
++out_unlock:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return ret;
+ }
+
+ static long futex_wait_restart(struct restart_block *restart);
diff --git a/patches/0006-futex-Cleanup-refcounting.patch b/patches/0006-futex-Cleanup-refcounting.patch
new file mode 100644
index 000000000000..e1e7b05733c8
--- /dev/null
+++ b/patches/0006-futex-Cleanup-refcounting.patch
@@ -0,0 +1,75 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:53 +0100
+Subject: [PATCH] futex: Cleanup refcounting
+
+Upstream commit bf92cf3a5100f5a0d5f9834787b130159397cb22
+
+Add a put_pi_state() as counterpart for get_pi_state() so the refcounting
+becomes consistent.
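The get/put pairing patch 0006 completes is ordinary reference counting; a
compilable userspace sketch of the discipline (toy names and simplified
semantics -- the real code caches one object per task instead of always
freeing, and warns rather than aborts):

    #include <stdatomic.h>
    #include <stdlib.h>

    struct toy_pi_state { atomic_int refcount; };

    /* Taking a reference is only legal while the object is already
     * pinned by another reference, hence the non-zero check. */
    static void toy_get(struct toy_pi_state *ps)
    {
        if (atomic_fetch_add(&ps->refcount, 1) == 0)
            abort();  /* mirrors WARN_ON_ONCE(!atomic_inc_not_zero(...)) */
    }

    /* The counterpart: the last put releases the object. */
    static void toy_put(struct toy_pi_state *ps)
    {
        if (atomic_fetch_sub(&ps->refcount, 1) == 1)
            free(ps);
    }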
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.801778516@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -800,7 +800,7 @@ static int refill_pi_state_cache(void)
+ return 0;
+ }
+
+-static struct futex_pi_state * alloc_pi_state(void)
++static struct futex_pi_state *alloc_pi_state(void)
+ {
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+@@ -810,6 +810,11 @@ static struct futex_pi_state * alloc_pi_
+ return pi_state;
+ }
+
++static void get_pi_state(struct futex_pi_state *pi_state)
++{
++ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
++}
++
+ /*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+@@ -854,7 +859,7 @@ static void put_pi_state(struct futex_pi
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+-static struct task_struct * futex_find_get_task(pid_t pid)
++static struct task_struct *futex_find_get_task(pid_t pid)
+ {
+ struct task_struct *p;
+
+@@ -1101,7 +1106,7 @@ static int attach_to_pi_state(u32 __user
+ goto out_einval;
+
+ out_attach:
+- atomic_inc(&pi_state->refcount);
++ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
+@@ -1988,7 +1993,7 @@ static int futex_requeue(u32 __user *uad
+ * refcount on the pi_state and store the pointer in
+ * the futex_q object of the waiter.
+ */
+- atomic_inc(&pi_state->refcount);
++ get_pi_state(pi_state);
+ this->pi_state = pi_state;
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
diff --git a/patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch b/patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch
new file mode 100644
index 000000000000..c07c8076e29b
--- /dev/null
+++ b/patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch
@@ -0,0 +1,139 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:54 +0100
+Subject: [PATCH] futex: Rework inconsistent rt_mutex/futex_q state
+
+Upstream commit 73d786bd043ebc855f349c81ea805f6b11cbf2aa
+
+There is a weird state in the futex_unlock_pi() path when it interleaves
+with a concurrent futex_lock_pi() at the point where it drops hb->lock.
+
+In this case, it can happen that the rt_mutex wait_list and the futex_q
+disagree on pending waiters, in particular rt_mutex will find no pending
+waiters where futex_q thinks there are. In this case the rt_mutex unlock
+code cannot assign an owner.
+
+The futex side fixup code has to clean up the inconsistencies with quite a
+bunch of interesting corner cases.
+
+Simplify all this by changing wake_futex_pi() to return -EAGAIN when this
+situation occurs. This then gives the futex_lock_pi() code the opportunity
+to continue and the retried futex_unlock_pi() will now observe a coherent
+state.
+
+The only problem is that this breaks RT timeliness guarantees. That
+is, consider the following scenario:
+
+ T1 and T2 are both pinned to CPU0. prio(T2) > prio(T1)
+
+ CPU0
+
+ T1
+   lock_pi()
+   queue_me()  <- Waiter is visible
+
+ preemption
+
+ T2
+   unlock_pi()
+     loops with -EAGAIN forever
+
+Which is undesirable for PI primitives. Future patches will rectify
+this.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.850383690@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 50 ++++++++++++++------------------------------------
+ 1 file changed, 14 insertions(+), 36 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1402,12 +1402,19 @@ static int wake_futex_pi(u32 __user *uad
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+ /*
+- * It is possible that the next waiter (the one that brought
+- * top_waiter owner to the kernel) timed out and is no longer
+- * waiting on the lock.
++ * When we interleave with futex_lock_pi() where it does
++ * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
++ * but the rt_mutex's wait_list can be empty (either still, or again,
++ * depending on which side we land).
++ *
++ * When this happens, give up our locks and try again, giving the
++ * futex_lock_pi() instance time to complete, either by waiting on the
++ * rtmutex or removing itself from the futex queue.
+ */
+- if (!new_owner)
+- new_owner = top_waiter->task;
++ if (!new_owner) {
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return -EAGAIN;
++ }
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always
+@@ -2330,7 +2337,6 @@ static long futex_wait_restart(struct re
+ */
+ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+ {
+- struct task_struct *owner;
+ int ret = 0;
+
+ if (locked) {
+@@ -2344,43 +2350,15 @@ static int fixup_owner(u32 __user *uaddr
+ }
+
+ /*
+- * Catch the rare case, where the lock was released when we were on the
+- * way back before we locked the hash bucket.
+- */
+- if (q->pi_state->owner == current) {
+- /*
+- * Try to get the rt_mutex now. This might fail as some other
+- * task acquired the rt_mutex after we removed ourself from the
+- * rt_mutex waiters list.
+- */
+- if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
+- locked = 1;
+- goto out;
+- }
+-
+- /*
+- * pi_state is incorrect, some other task did a lock steal and
+- * we returned due to timeout or signal without taking the
+- * rt_mutex. Too late.
+- */
+- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
+- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+- if (!owner)
+- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
+- ret = fixup_pi_state_owner(uaddr, q, owner);
+- goto out;
+- }
+-
+- /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex.
+ */
+- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
++ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
+ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+ "pi-state %p\n", ret,
+ q->pi_state->pi_mutex.owner,
+ q->pi_state->owner);
++ }
+
+ out:
+ return ret ? ret : locked;
diff --git a/patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch b/patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch
new file mode 100644
index 000000000000..53d4c2257a8a
--- /dev/null
+++ b/patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch
@@ -0,0 +1,357 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:55 +0100
+Subject: [PATCH] futex: Pull rt_mutex_futex_unlock() out from under hb->lock
+
+Upstream commit 16ffa12d742534d4ff73e8b3a4e81c1de39196f0
+
+There are a number of 'interesting' problems, all caused by holding
+hb->lock while doing the rt_mutex_unlock() equivalent.
+
+Notably:
+
+ - a PI inversion on hb->lock; and,
+
+ - a SCHED_DEADLINE crash because of pointer instability.
+
+The previous changes:
+
+ - changed the locking rules to cover {uval,pi_state} with wait_lock.
+
+ - allowed doing rt_mutex_futex_unlock() without dropping wait_lock, which in
+   turn allows relying on wait_lock atomicity completely.
+
+ - simplified the waiter conundrum.
+
+It's now sufficient to hold rtmutex::wait_lock and a reference on the
+pi_state to protect the state consistency, so hb->lock can be dropped
+before calling rt_mutex_futex_unlock().
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 154 +++++++++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 100 insertions(+), 54 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -919,10 +919,12 @@ void exit_pi_state_list(struct task_stru
+ pi_state->owner = NULL;
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+- rt_mutex_futex_unlock(&pi_state->pi_mutex);
+-
++ get_pi_state(pi_state);
+ spin_unlock(&hb->lock);
+
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+@@ -1035,6 +1037,11 @@ static int attach_to_pi_state(u32 __user
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
++ *
++ * The waiter holding a reference on @pi_state also protects against
++ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
++ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
++ * free pi_state before we can take a reference ourselves.
+ */
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+@@ -1378,48 +1385,40 @@ static void mark_wake_futex(struct wake_
+ smp_store_release(&q->lock_ptr, NULL);
+ }
+
+-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
+- struct futex_hash_bucket *hb)
++/*
++ * Caller must hold a reference on @pi_state.
++ */
++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+ {
+- struct task_struct *new_owner;
+- struct futex_pi_state *pi_state = top_waiter->pi_state;
+ u32 uninitialized_var(curval), newval;
++ struct task_struct *new_owner;
++ bool deboost = false;
+ WAKE_Q(wake_q);
+- bool deboost;
+ int ret = 0;
+
+- if (!pi_state)
+- return -EINVAL;
+-
+- /*
+- * If current does not own the pi_state then the futex is
+- * inconsistent and user space fiddled with the futex value.
+- */
+- if (pi_state->owner != current)
+- return -EINVAL;
+-
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+-
+- /*
+- * When we interleave with futex_lock_pi() where it does
+- * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
+- * but the rt_mutex's wait_list can be empty (either still, or again,
+- * depending on which side we land).
+- *
+- * When this happens, give up our locks and try again, giving the
+- * futex_lock_pi() instance time to complete, either by waiting on the
+- * rtmutex or removing itself from the futex queue.
+- */
+ if (!new_owner) {
+- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+- return -EAGAIN;
++ /*
++ * Since we held neither hb->lock nor wait_lock when coming
++ * into this function, we could have raced with futex_lock_pi()
++ * such that we might observe @this futex_q waiter, but the
++ * rt_mutex's wait_list can be empty (either still, or again,
++ * depending on which side we land).
++ *
++ * When this happens, give up our locks and try again, giving
++ * the futex_lock_pi() instance time to complete, either by
++ * waiting on the rtmutex or removing itself from the futex
++ * queue.
++ */
++ ret = -EAGAIN;
++ goto out_unlock;
+ }
+
+ /*
+- * We pass it to the next owner. The WAITERS bit is always
+- * kept enabled while there is PI state around. We cleanup the
+- * owner died bit, because we are the owner.
++ * We pass it to the next owner. The WAITERS bit is always kept
++ * enabled while there is PI state around. We cleanup the owner
++ * died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+@@ -1442,10 +1441,8 @@ static int wake_futex_pi(u32 __user *uad
+ ret = -EINVAL;
+ }
+
+- if (ret) {
+- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+- return ret;
+- }
++ if (ret)
++ goto out_unlock;
+
+ raw_spin_lock(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+@@ -1463,15 +1460,15 @@ static int wake_futex_pi(u32 __user *uad
+ */
+ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
++out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+- spin_unlock(&hb->lock);
+
+ if (deboost) {
+ wake_up_q(&wake_q);
+ rt_mutex_adjust_prio(current);
+ }
+
+- return 0;
++ return ret;
+ }
+
+ /*
+@@ -2230,7 +2227,8 @@ static int fixup_pi_state_owner(u32 __us
+ /*
+ * We are here either because we stole the rtmutex from the
+ * previous highest priority waiter or we are the highest priority
+- * waiter but failed to get the rtmutex the first time.
++ * waiter but have failed to get the rtmutex the first time.
++ *
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+@@ -2247,7 +2245,7 @@ static int fixup_pi_state_owner(u32 __us
+ if (get_futex_value_locked(&uval, uaddr))
+ goto handle_fault;
+
+- while (1) {
++ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+@@ -2343,6 +2341,10 @@ static int fixup_owner(u32 __user *uaddr
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
++ *
++ * We can safely read pi_state->owner without holding wait_lock
++ * because we now own the rt_mutex, only the owner will attempt
++ * to change it.
+ */
+ if (q->pi_state->owner != current)
+ ret = fixup_pi_state_owner(uaddr, q, current);
+@@ -2582,6 +2584,7 @@ static int futex_lock_pi(u32 __user *uad
+ ktime_t *time, int trylock)
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
++ struct futex_pi_state *pi_state = NULL;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+@@ -2668,12 +2671,19 @@ static int futex_lock_pi(u32 __user *uad
+ * If fixup_owner() faulted and was unable to handle the fault, unlock
+ * it and return the fault to userspace.
+ */
+- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+- rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
++ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
+ goto out_put_key;
+
+ out_unlock_put_key:
+@@ -2736,10 +2746,36 @@ static int futex_unlock_pi(u32 __user *u
+ */
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+- ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
++ struct futex_pi_state *pi_state = top_waiter->pi_state;
++
++ ret = -EINVAL;
++ if (!pi_state)
++ goto out_unlock;
++
++ /*
++ * If current does not own the pi_state then the futex is
++ * inconsistent and user space fiddled with the futex value.
++ */
++ if (pi_state->owner != current)
++ goto out_unlock;
++
++ /*
++ * Grab a reference on the pi_state and drop hb->lock.
++ *
++ * The reference ensures pi_state lives, dropping the hb->lock
++ * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
++ * close the races against futex_lock_pi(), but in case of
++ * _any_ fail we'll abort and retry the whole deal.
++ */
++ get_pi_state(pi_state);
++ spin_unlock(&hb->lock);
++
++ ret = wake_futex_pi(uaddr, uval, pi_state);
++
++ put_pi_state(pi_state);
++
+ /*
+- * In case of success wake_futex_pi dropped the hash
+- * bucket lock.
++ * Success, we're done! No tricky corner cases.
+ */
+ if (!ret)
+ goto out_putkey;
+@@ -2754,7 +2790,6 @@ static int futex_unlock_pi(u32 __user *u
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN) {
+- spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ goto retry;
+ }
+@@ -2762,7 +2797,7 @@ static int futex_unlock_pi(u32 __user *u
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
+- goto out_unlock;
++ goto out_putkey;
+ }
+
+ /*
+@@ -2772,8 +2807,10 @@ static int futex_unlock_pi(u32 __user *u
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
++ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
++ spin_unlock(&hb->lock);
+ goto pi_faulted;
++ }
+
+ /*
+ * If uval has changed, let user space handle it.
+@@ -2787,7 +2824,6 @@ static int futex_unlock_pi(u32 __user *u
+ return ret;
+
+ pi_faulted:
+- spin_unlock(&hb->lock);
+ put_futex_key(&key);
+
+ ret = fault_in_user_writeable(uaddr);
+@@ -2891,6 +2927,7 @@ static int futex_wait_requeue_pi(u32 __u
+ u32 __user *uaddr2)
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
++ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ union futex_key key2 = FUTEX_KEY_INIT;
+@@ -2975,8 +3012,10 @@ static int futex_wait_requeue_pi(u32 __u
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
+- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+- rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+@@ -3015,13 +3054,20 @@ static int futex_wait_requeue_pi(u32 __u
+ * the fault, unlock the rt_mutex and return the fault to
+ * userspace.
+ */
+- if (ret && rt_mutex_owner(pi_mutex) == current)
+- rt_mutex_futex_unlock(pi_mutex);
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
+ }
+
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart by calling
diff --git a/patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch b/patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch
new file mode 100644
index 000000000000..10b1039f290e
--- /dev/null
+++ b/patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch
@@ -0,0 +1,79 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:56 +0100
+Subject: [PATCH] futex,rt_mutex: Introduce rt_mutex_init_waiter()
+
+Upstream commit 50809358dd7199aa7ce232f6877dd09ec30ef374
+
+Since there are already two copies of this code, introduce a helper now
+before adding a third one.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 5 +----
+ kernel/locking/rtmutex.c | 12 +++++++++---
+ kernel/locking/rtmutex_common.h | 1 +
+ 3 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2954,10 +2954,7 @@ static int futex_wait_requeue_pi(u32 __u
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+- debug_rt_mutex_init_waiter(&rt_waiter);
+- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+- RB_CLEAR_NODE(&rt_waiter.tree_entry);
+- rt_waiter.task = NULL;
++ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1151,6 +1151,14 @@ void rt_mutex_adjust_pi(struct task_stru
+ next_lock, NULL, task);
+ }
+
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++{
++ debug_rt_mutex_init_waiter(waiter);
++ RB_CLEAR_NODE(&waiter->pi_tree_entry);
++ RB_CLEAR_NODE(&waiter->tree_entry);
++ waiter->task = NULL;
++}
++
+ /**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+@@ -1233,9 +1241,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+ unsigned long flags;
+ int ret = 0;
+
+- debug_rt_mutex_init_waiter(&waiter);
+- RB_CLEAR_NODE(&waiter.pi_tree_entry);
+- RB_CLEAR_NODE(&waiter.tree_entry);
++ rt_mutex_init_waiter(&waiter);
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -103,6 +103,7 @@ extern void rt_mutex_init_proxy_locked(s
+ struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
diff --git a/patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch b/patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch
new file mode 100644
index 000000000000..10af5d18f2ea
--- /dev/null
+++ b/patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch
@@ -0,0 +1,158 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:57 +0100
+Subject: [PATCH] futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()
+
+Upstream commit 38d589f2fd08f1296aea3ce62bebd185125c6d81
+
+With the ultimate goal of keeping rt_mutex wait_list and futex_q waiters
+consistent it's necessary to split 'rt_mutex_futex_lock()' into finer
+parts, such that only the actual blocking can be done without hb->lock
+held.
+
+Split rt_mutex_finish_proxy_lock() into two parts, one that does the
+blocking and one that does remove_waiter() when the lock acquire failed.
+
+When the rtmutex was acquired successfully the waiter can be removed in the
+acquisition path safely, since there is no concurrency on the lock owner.
+
+This means that, except for futex_lock_pi(), all wait_list modifications
+are done with both hb->lock and wait_lock held.
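The resulting calling convention is: block first, then decide under the lock
whether the failure was real. A compilable toy of the cleanup half (a pthread
mutex stands in for rt_mutex::wait_lock; the names and the elided list
handling are illustrative only, not the kernel API):

    #include <pthread.h>
    #include <stdbool.h>

    struct toy_lock {
        pthread_mutex_t wait_lock;
        void *owner;               /* NULL or the owning "task" */
    };

    /* After a failed wait we may still have been granted the lock
     * before we could dequeue ourselves. Only treat the failure as
     * real if we did not become the owner. */
    static bool toy_cleanup_proxy_lock(struct toy_lock *l, void *self)
    {
        bool cleaned = false;

        pthread_mutex_lock(&l->wait_lock);
        if (l->owner != self) {
            /* the remove_waiter() equivalent would go here */
            cleaned = true;
        }
        pthread_mutex_unlock(&l->wait_lock);
        return cleaned;  /* false: we own the lock, disregard the error */
    }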
+
+[bigeasy@linutronix.de: fix for futex_requeue_pi_signal_restart]
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.001659630@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 7 +++--
+ kernel/locking/rtmutex.c | 52 ++++++++++++++++++++++++++++++------
+ kernel/locking/rtmutex_common.h | 8 +++---
+ 3 files changed, 55 insertions(+), 12 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3030,10 +3030,13 @@ static int futex_wait_requeue_pi(u32 __u
+ */
+ WARN_ON(!q.pi_state);
+ pi_mutex = &q.pi_state->pi_mutex;
+- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
+- debug_rt_mutex_free_waiter(&rt_waiter);
++ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+ spin_lock(q.lock_ptr);
++ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
++ ret = 0;
++
++ debug_rt_mutex_free_waiter(&rt_waiter);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1743,21 +1743,23 @@ struct task_struct *rt_mutex_next_owner(
+ }
+
+ /**
+- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
++ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+- * Complete the lock acquisition started our behalf by another thread.
++ * Wait for the lock acquisition started on our behalf by
++ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
++ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+- * Special API call for PI-futex requeue support
++ * Special API call for PI-futex support
+ */
+-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
++int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+ {
+@@ -1770,9 +1772,6 @@ int rt_mutex_finish_proxy_lock(struct rt
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+- if (unlikely(ret))
+- remove_waiter(lock, waiter);
+-
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+@@ -1783,3 +1782,42 @@ int rt_mutex_finish_proxy_lock(struct rt
+
+ return ret;
+ }
++
++/**
++ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
++ * @lock: the rt_mutex we were woken on
++ * @waiter: the pre-initialized rt_mutex_waiter
++ *
++ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
++ *
++ * Unless we acquired the lock, we're still enqueued on the wait-list and can
++ * in fact still be granted ownership until we're removed. Therefore we can
++ * find we are in fact the owner and must disregard the
++ * rt_mutex_wait_proxy_lock() failure.
++ *
++ * Returns:
++ * true - did the cleanup, we are done.
++ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
++ * caller should disregard its return value.
++ *
++ * Special API call for PI-futex support
++ */
++bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter)
++{
++ bool cleanup = false;
++
++ raw_spin_lock_irq(&lock->wait_lock);
++ /*
++ * Unless we're the owner, we're still enqueued on the wait_list.
++ * So check if we became owner, if not, take us off the wait_list.
++ */
++ if (rt_mutex_owner(lock) != current) {
++ remove_waiter(lock, waiter);
++ fixup_rt_mutex_waiters(lock);
++ cleanup = true;
++ }
++ raw_spin_unlock_irq(&lock->wait_lock);
++
++ return cleanup;
++}
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -107,9 +107,11 @@ extern void rt_mutex_init_waiter(struct
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+- struct hrtimer_sleeper *to,
+- struct rt_mutex_waiter *waiter);
++extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
++ struct hrtimer_sleeper *to,
++ struct rt_mutex_waiter *waiter);
++extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter);
+
+ extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
diff --git a/patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch b/patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch
new file mode 100644
index 000000000000..84017cee0304
--- /dev/null
+++ b/patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch
@@ -0,0 +1,266 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:58 +0100
+Subject: [PATCH] futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()
+
+Upstream commit cfafcd117da0216520568c195cb2f6cd1980c4bb
+
+By changing futex_lock_pi() to use rt_mutex_*_proxy_lock() all wait_list
+modifications are done under both hb->lock and wait_lock.
+
+This closes the obvious interleave pattern between futex_lock_pi() and
+futex_unlock_pi(), but not entirely so. See below:
+
+Before:
+
+futex_lock_pi()                 futex_unlock_pi()
+                                  unlock hb->lock
+
+  lock hb->lock
+  unlock hb->lock
+
+                                  lock rt_mutex->wait_lock
+                                  unlock rt_mutex_wait_lock
+                                    -EAGAIN
+
+  lock rt_mutex->wait_lock
+  list_add
+  unlock rt_mutex->wait_lock
+
+  schedule()
+
+  lock rt_mutex->wait_lock
+  list_del
+  unlock rt_mutex->wait_lock
+
+                                  <idem>
+                                    -EAGAIN
+
+  lock hb->lock
+
+
+After:
+
+futex_lock_pi()                 futex_unlock_pi()
+
+  lock hb->lock
+  lock rt_mutex->wait_lock
+  list_add
+  unlock rt_mutex->wait_lock
+  unlock hb->lock
+
+  schedule()
+                                  lock hb->lock
+                                  unlock hb->lock
+  lock hb->lock
+  lock rt_mutex->wait_lock
+  list_del
+  unlock rt_mutex->wait_lock
+
+                                  lock rt_mutex->wait_lock
+                                  unlock rt_mutex_wait_lock
+                                    -EAGAIN
+
+  unlock hb->lock
+
+
+It does however solve the earlier starvation/live-lock scenario which got
+introduced with the -EAGAIN, since unlike the before scenario, where the
+-EAGAIN happens while futex_unlock_pi() doesn't hold any locks, in the
+after scenario it happens while futex_unlock_pi() actually holds a lock,
+and then it is serialized on that lock.
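The "After" diagram reads as a fixed lock dance around the blocking point. A
compilable toy rendering of the futex_lock_pi() side (pthread mutexes stand
in for hb->lock and rt_mutex::wait_lock; the list operations, the actual
sleep, and the failure path are elided):

    #include <pthread.h>

    static pthread_mutex_t hb_lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;

    static void lock_pi_dance(void)
    {
        /* enqueue the waiter while holding both locks */
        pthread_mutex_lock(&hb_lock);
        pthread_mutex_lock(&wait_lock);
        pthread_mutex_unlock(&wait_lock);
        pthread_mutex_unlock(&hb_lock);

        /* schedule(): block here until woken or timed out */

        /* dequeue the waiter, again under both locks */
        pthread_mutex_lock(&hb_lock);
        pthread_mutex_lock(&wait_lock);
        pthread_mutex_unlock(&wait_lock);
        pthread_mutex_unlock(&hb_lock);
    }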
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 77 ++++++++++++++++++++++++++++------------
+ kernel/locking/rtmutex.c | 26 +++----------
+ kernel/locking/rtmutex_common.h | 1
+ 3 files changed, 62 insertions(+), 42 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2097,20 +2097,7 @@ queue_unlock(struct futex_hash_bucket *h
+ hb_waiters_dec(hb);
+ }
+
+-/**
+- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+- * @q: The futex_q to enqueue
+- * @hb: The destination hash bucket
+- *
+- * The hb->lock must be held by the caller, and is released here. A call to
+- * queue_me() is typically paired with exactly one call to unqueue_me(). The
+- * exceptions involve the PI related operations, which may use unqueue_me_pi()
+- * or nothing if the unqueue is done as part of the wake process and the unqueue
+- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+- * an example).
+- */
+-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+- __releases(&hb->lock)
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ {
+ int prio;
+
+@@ -2127,6 +2114,24 @@ static inline void queue_me(struct futex
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q: The futex_q to enqueue
++ * @hb: The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me(). The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
++ * an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ __queue_me(q, hb);
+ spin_unlock(&hb->lock);
+ }
+
+@@ -2585,6 +2590,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
++ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+@@ -2637,25 +2643,52 @@ static int futex_lock_pi(u32 __user *uad
+ }
+ }
+
++ WARN_ON(!q.pi_state);
++
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+- queue_me(&q, hb);
++ __queue_me(&q, hb);
+
+- WARN_ON(!q.pi_state);
+- /*
+- * Block on the PI mutex:
+- */
+- if (!trylock) {
+- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+- } else {
++ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
++ goto no_block;
+ }
+
++ /*
++ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
++ * such that the hb and rt_mutex wait lists match.
++ */
++ rt_mutex_init_waiter(&rt_waiter);
++ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ if (ret) {
++ if (ret == 1)
++ ret = 0;
++
++ goto no_block;
++ }
++
++ spin_unlock(q.lock_ptr);
++
++ if (unlikely(to))
++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++
+ spin_lock(q.lock_ptr);
+ /*
++ * If we failed to acquire the lock (signal/timeout), we must
++ * first acquire the hb->lock before removing the lock from the
++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
++ * wait lists consistent.
++ */
++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
++ ret = 0;
++
++no_block:
++ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1491,19 +1491,6 @@ int __sched rt_mutex_lock_interruptible(
+ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+ /*
+- * Futex variant with full deadlock detection.
+- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+- */
+-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+- struct hrtimer_sleeper *timeout)
+-{
+- might_sleep();
+-
+- return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+- timeout, RT_MUTEX_FULL_CHAINWALK);
+-}
+-
+-/*
+ * Futex variant, must not use fastpath.
+ */
+ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+@@ -1772,12 +1759,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+- /*
+- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+- * have to fix that up.
+- */
+- fixup_rt_mutex_waiters(lock);
+-
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+@@ -1817,6 +1798,13 @@ bool rt_mutex_cleanup_proxy_lock(struct
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
++
++ /*
++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++ * have to fix that up.
++ */
++ fixup_rt_mutex_waiters(lock);
++
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -113,7 +113,6 @@ extern int rt_mutex_wait_proxy_lock(stru
+ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+ extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
diff --git a/patches/0012-futex-Futex_unlock_pi-determinism.patch b/patches/0012-futex-Futex_unlock_pi-determinism.patch
new file mode 100644
index 000000000000..9cd5ce650ff6
--- /dev/null
+++ b/patches/0012-futex-Futex_unlock_pi-determinism.patch
@@ -0,0 +1,80 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:59 +0100
+Subject: [PATCH] futex: Futex_unlock_pi() determinism
+
+Upstream commit bebe5b514345f09be2c15e414d076b02ecb9cce8
+
+The problem with returning -EAGAIN when the waiter state mismatches is that
+it becomes very hard to prove a bounded execution time on the
+operation. And seeing that this is an RT operation, this is somewhat
+important.
+
+While in practice, given the previous patch, it will be very unlikely to
+ever really take more than one or two rounds, proving so becomes rather
+hard.
+
+However, now that modifying wait_list is done while holding both hb->lock
+and wait_lock, the scenario can be avoided entirely by acquiring wait_lock
+while still holding hb->lock, doing a hand-over without leaving a hole.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.112378812@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 24 +++++++++++-------------
+ 1 file changed, 11 insertions(+), 13 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1396,15 +1396,10 @@ static int wake_futex_pi(u32 __user *uad
+ WAKE_Q(wake_q);
+ int ret = 0;
+
+- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+- if (!new_owner) {
++ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+- * Since we held neither hb->lock nor wait_lock when coming
+- * into this function, we could have raced with futex_lock_pi()
+- * such that we might observe @this futex_q waiter, but the
+- * rt_mutex's wait_list can be empty (either still, or again,
+- * depending on which side we land).
++ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+@@ -2792,15 +2787,18 @@ static int futex_unlock_pi(u32 __user *u
+ if (pi_state->owner != current)
+ goto out_unlock;
+
++ get_pi_state(pi_state);
+ /*
+- * Grab a reference on the pi_state and drop hb->lock.
++ * Since modifying the wait_list is done while holding both
++ * hb->lock and wait_lock, holding either is sufficient to
++ * observe it.
+ *
+- * The reference ensures pi_state lives, dropping the hb->lock
+- * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
+- * close the races against futex_lock_pi(), but in case of
+- * _any_ fail we'll abort and retry the whole deal.
++ * By taking wait_lock while still holding hb->lock, we ensure
++ * there is no point where we hold neither; and therefore
++ * wake_futex_pi() must observe a state consistent with what we
++ * observed.
+ */
+- get_pi_state(pi_state);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
diff --git a/patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch b/patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
new file mode 100644
index 000000000000..2128174f26cd
--- /dev/null
+++ b/patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
@@ -0,0 +1,203 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:36:00 +0100
+Subject: [PATCH] futex: Drop hb->lock before enqueueing on the rtmutex
+
+Upstream commit 56222b212e8edb1cf51f5dd73ff645809b082b40
+
+When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution the PI
+chain code will (falsely) report a deadlock and BUG.
+
+The problem is that it holds hb->lock (now an rt_mutex) while doing
+task_blocks_on_rt_mutex on the futex's pi_state::rtmutex. This, when
+interleaved just right with futex_unlock_pi(), leads it to believe it sees an
+AB-BA deadlock.
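The same shape can be written as a self-contained pthread program; hb_lock
and pi_mutex are illustrative names standing in for hb->lock and the
pi_state rtmutex, and lock_side()/unlock_side() are invented for the sketch.
A lock-order checker sees A->B in one thread and B->A in the other and
reports AB-BA, yet the program can never hang, because the first lock is
dropped before actually blocking on the second - the property the chain
walk cannot see:

    #include <pthread.h>
    #include <stdio.h>

    static pthread_mutex_t hb_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t pi_mutex = PTHREAD_MUTEX_INITIALIZER;

    static void *lock_side(void *arg)       /* models futex_lock_pi() */
    {
            pthread_mutex_lock(&hb_lock);
            /* "enqueue" on pi_mutex without blocking; in the kernel this
             * is task_blocks_on_rt_mutex(), and the chain walk already
             * records hb->lock in the blocking chain at this point. */
            if (pthread_mutex_trylock(&pi_mutex) == 0)
                    pthread_mutex_unlock(&pi_mutex);
            pthread_mutex_unlock(&hb_lock);
            /* only now do we really block - hb_lock is no longer held */
            pthread_mutex_lock(&pi_mutex);
            pthread_mutex_unlock(&pi_mutex);
            return NULL;
    }

    static void *unlock_side(void *arg)     /* models futex_unlock_pi() */
    {
            pthread_mutex_lock(&pi_mutex);
            pthread_mutex_lock(&hb_lock);   /* B -> A: the inverted order */
            pthread_mutex_unlock(&hb_lock);
            pthread_mutex_unlock(&pi_mutex);
            return NULL;
    }

    int main(void)
    {
            pthread_t a, b;

            pthread_create(&a, NULL, lock_side, NULL);
            pthread_create(&b, NULL, unlock_side, NULL);
            pthread_join(a, NULL);
            pthread_join(b, NULL);
            puts("no deadlock: neither side blocks while holding the other lock");
            return 0;
    }

The kernel-side interleaving this models is the one diagrammed below.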
+ + Task1 (holds rt_mutex, Task2 (does FUTEX_LOCK_PI) + does FUTEX_UNLOCK_PI) + + lock hb->lock + lock rt_mutex (as per start_proxy) + lock hb->lock + +Which is a trivial AB-BA. + +It is not an actual deadlock, because it won't be holding hb->lock by the +time it actually blocks on the rt_mutex, but the chainwalk code doesn't +know that and it would be a nightmare to handle this gracefully. + +To avoid this problem, do the same as in futex_unlock_pi() and drop +hb->lock after acquiring wait_lock. This still fully serializes against +futex_unlock_pi(), since adding to the wait_list does the very same lock +dance, and removing it holds both locks. + +Aside of solving the RT problem this makes the lock and unlock mechanism +symetric and reduces the hb->lock held time. + +Reported-and-tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Suggested-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: juri.lelli@arm.com +Cc: xlpang@redhat.com +Cc: rostedt@goodmis.org +Cc: mathieu.desnoyers@efficios.com +Cc: jdesfossez@efficios.com +Cc: dvhart@infradead.org +Cc: bristot@redhat.com +Link: http://lkml.kernel.org/r/20170322104152.161341537@infradead.org +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/futex.c | 30 +++++++++++++++++------- + kernel/locking/rtmutex.c | 49 ++++++++++++++++++++++------------------ + kernel/locking/rtmutex_common.h | 3 ++ + 3 files changed, 52 insertions(+), 30 deletions(-) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2652,20 +2652,33 @@ static int futex_lock_pi(u32 __user *uad + goto no_block; + } + ++ rt_mutex_init_waiter(&rt_waiter); ++ + /* +- * We must add ourselves to the rt_mutex waitlist while holding hb->lock +- * such that the hb and rt_mutex wait lists match. ++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not ++ * hold it while doing rt_mutex_start_proxy(), because then it will ++ * include hb->lock in the blocking chain, even through we'll not in ++ * fact hold it while blocking. This will lead it to report -EDEADLK ++ * and BUG when futex_unlock_pi() interleaves with this. ++ * ++ * Therefore acquire wait_lock while holding hb->lock, but drop the ++ * latter before calling rt_mutex_start_proxy_lock(). This still fully ++ * serializes against futex_unlock_pi() as that does the exact same ++ * lock handoff sequence. + */ +- rt_mutex_init_waiter(&rt_waiter); +- ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); ++ spin_unlock(q.lock_ptr); ++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); ++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); ++ + if (ret) { + if (ret == 1) + ret = 0; + ++ spin_lock(q.lock_ptr); + goto no_block; + } + +- spin_unlock(q.lock_ptr); + + if (unlikely(to)) + hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS); +@@ -2678,6 +2691,9 @@ static int futex_lock_pi(u32 __user *uad + * first acquire the hb->lock before removing the lock from the + * rt_mutex waitqueue, such that we can keep the hb and rt_mutex + * wait lists consistent. ++ * ++ * In particular; it is important that futex_unlock_pi() can not ++ * observe this inconsistency. 
+ */ + if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter)) + ret = 0; +@@ -2789,10 +2805,6 @@ static int futex_unlock_pi(u32 __user *u + + get_pi_state(pi_state); + /* +- * Since modifying the wait_list is done while holding both +- * hb->lock and wait_lock, holding either is sufficient to +- * observe it. +- * + * By taking wait_lock while still holding hb->lock, we ensure + * there is no point where we hold neither; and therefore + * wake_futex_pi() must observe a state consistent with what we +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1659,31 +1659,14 @@ void rt_mutex_proxy_unlock(struct rt_mut + rt_mutex_set_owner(lock, NULL); + } + +-/** +- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task +- * @lock: the rt_mutex to take +- * @waiter: the pre-initialized rt_mutex_waiter +- * @task: the task to prepare +- * +- * Returns: +- * 0 - task blocked on lock +- * 1 - acquired the lock for task, caller should wake it up +- * <0 - error +- * +- * Special API call for FUTEX_REQUEUE_PI support. +- */ +-int rt_mutex_start_proxy_lock(struct rt_mutex *lock, ++int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task) + { + int ret; + +- raw_spin_lock_irq(&lock->wait_lock); +- +- if (try_to_take_rt_mutex(lock, task, NULL)) { +- raw_spin_unlock_irq(&lock->wait_lock); ++ if (try_to_take_rt_mutex(lock, task, NULL)) + return 1; +- } + + /* We enforce deadlock detection for futexes */ + ret = task_blocks_on_rt_mutex(lock, waiter, task, +@@ -1702,12 +1685,36 @@ int rt_mutex_start_proxy_lock(struct rt_ + if (unlikely(ret)) + remove_waiter(lock, waiter); + +- raw_spin_unlock_irq(&lock->wait_lock); +- + debug_rt_mutex_print_deadlock(waiter); + + return ret; + } ++ ++/** ++ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task ++ * @lock: the rt_mutex to take ++ * @waiter: the pre-initialized rt_mutex_waiter ++ * @task: the task to prepare ++ * ++ * Returns: ++ * 0 - task blocked on lock ++ * 1 - acquired the lock for task, caller should wake it up ++ * <0 - error ++ * ++ * Special API call for FUTEX_REQUEUE_PI support. 
++ */ ++int rt_mutex_start_proxy_lock(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ struct task_struct *task) ++{ ++ int ret; ++ ++ raw_spin_lock_irq(&lock->wait_lock); ++ ret = __rt_mutex_start_proxy_lock(lock, waiter, task); ++ raw_spin_unlock_irq(&lock->wait_lock); ++ ++ return ret; ++} + + /** + * rt_mutex_next_owner - return the next owner of the lock +--- a/kernel/locking/rtmutex_common.h ++++ b/kernel/locking/rtmutex_common.h +@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s + extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); + extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); ++extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ++ struct rt_mutex_waiter *waiter, ++ struct task_struct *task); + extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); diff --git a/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch b/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch index 415a19d256ab..d0443c81bdaa 100644 --- a/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch +++ b/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch @@ -30,7 +30,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> --- a/kernel/futex.c +++ b/kernel/futex.c -@@ -904,7 +904,9 @@ void exit_pi_state_list(struct task_stru +@@ -909,7 +909,9 @@ void exit_pi_state_list(struct task_stru * task still owns the PI-state: */ if (head->next != next) { diff --git a/patches/futex-requeue-pi-fix.patch b/patches/futex-requeue-pi-fix.patch index 2719fd7e8926..e87a4fa978fc 100644 --- a/patches/futex-requeue-pi-fix.patch +++ b/patches/futex-requeue-pi-fix.patch @@ -65,9 +65,9 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } /* -@@ -1704,6 +1705,35 @@ int rt_mutex_start_proxy_lock(struct rt_ +@@ -1696,6 +1697,35 @@ int __rt_mutex_start_proxy_lock(struct r + if (try_to_take_rt_mutex(lock, task, NULL)) return 1; - } +#ifdef CONFIG_PREEMPT_RT_FULL + /* diff --git a/patches/futex-workaround-migrate_disable-enable-in-different.patch b/patches/futex-workaround-migrate_disable-enable-in-different.patch new file mode 100644 index 000000000000..135c59df93c4 --- /dev/null +++ b/patches/futex-workaround-migrate_disable-enable-in-different.patch @@ -0,0 +1,58 @@ +From: Thomas Gleixner <tglx@linutronix.de> +Date: Wed, 8 Mar 2017 14:23:35 +0100 +Subject: [PATCH] futex: workaround migrate_disable/enable in different context + +migrate_disable()/migrate_enable() takes a different path in atomic() vs +!atomic() context. These little hacks ensure that we don't underflow / overflow +the migrate code counts properly while we lock the hb lockwith interrupts +enabled and unlock it with interrupts disabled. + +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/futex.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +--- a/kernel/futex.c ++++ b/kernel/futex.c +@@ -2667,9 +2667,18 @@ static int futex_lock_pi(u32 __user *uad + * lock handoff sequence. + */ + raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock); ++ /* ++ * the migrate_disable() here disables migration in the in_atomic() fast ++ * path which is enabled again in the following spin_unlock(). We have ++ * one migrate_disable() pending in the slow-path which is reversed ++ * after the raw_spin_unlock_irq() where we leave the atomic context. 
++ */ ++ migrate_disable(); ++ + spin_unlock(q.lock_ptr); + ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current); + raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock); ++ migrate_enable(); + + if (ret) { + if (ret == 1) +@@ -2811,10 +2820,21 @@ static int futex_unlock_pi(u32 __user *u + * observed. + */ + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); ++ /* ++ * Magic trickery for now to make the RT migrate disable ++ * logic happy. The following spin_unlock() happens with ++ * interrupts disabled so the internal migrate_enable() ++ * won't undo the migrate_disable() which was issued when ++ * locking hb->lock. ++ */ ++ migrate_disable(); + spin_unlock(&hb->lock); + ++ /* Drops pi_state->pi_mutex.wait_lock */ + ret = wake_futex_pi(uaddr, uval, pi_state); + ++ migrate_enable(); ++ + put_pi_state(pi_state); + + /* diff --git a/patches/introduce_migrate_disable_cpu_light.patch b/patches/introduce_migrate_disable_cpu_light.patch index a4dd649cf76e..5eda023568c6 100644 --- a/patches/introduce_migrate_disable_cpu_light.patch +++ b/patches/introduce_migrate_disable_cpu_light.patch @@ -138,7 +138,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/include/linux/smp.h +++ b/include/linux/smp.h -@@ -185,6 +185,9 @@ static inline void smp_init(void) { } +@@ -197,6 +197,9 @@ static inline int get_boot_cpu_id(void) #define get_cpu() ({ preempt_disable(); smp_processor_id(); }) #define put_cpu() preempt_enable() diff --git a/patches/kernel-futex-don-t-deboost-too-early.patch b/patches/kernel-futex-don-t-deboost-too-early.patch deleted file mode 100644 index d902342dc9f1..000000000000 --- a/patches/kernel-futex-don-t-deboost-too-early.patch +++ /dev/null @@ -1,161 +0,0 @@ -From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> -Date: Thu, 29 Sep 2016 18:49:22 +0200 -Subject: [PATCH] kernel/futex: don't deboost too early - -The sequence: - T1 holds futex - T2 blocks on futex and boosts T1 - T1 unlocks futex and holds hb->lock - T1 unlocks rt mutex, so T1 has no more pi waiters - T3 blocks on hb->lock and adds itself to the pi waiters list of T1 - T1 unlocks hb->lock and deboosts itself - T4 preempts T1 so the wakeup of T2 gets delayed - -As a workaround I attempt here do unlock the hb->lock without a deboost -and perform the deboost after the wake up of the waiter. 
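The boost/deboost pair that this now-removed workaround reordered can be
watched from userspace with a priority-inheritance mutex, which goes through
the same FUTEX_LOCK_PI/FUTEX_UNLOCK_PI paths. A minimal sketch, assuming
root (for SCHED_FIFO) and a kernel with PI-futex support; the thread names
low/high, the priorities 10/80 and the sleep lengths are all arbitrary:

    #include <pthread.h>
    #include <sched.h>
    #include <stdio.h>
    #include <unistd.h>

    static pthread_mutex_t m;

    static void *low(void *arg)
    {
            struct sched_param sp = { .sched_priority = 10 };

            pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
            pthread_mutex_lock(&m);
            /* 'high' blocks on m at ~1s and boosts us to 80 until we
             * unlock; observe it with chrt -p <tid> meanwhile. */
            sleep(2);
            pthread_mutex_unlock(&m);   /* wakeup and deboost happen here */
            return NULL;
    }

    static void *high(void *arg)
    {
            struct sched_param sp = { .sched_priority = 80 };

            pthread_setschedparam(pthread_self(), SCHED_FIFO, &sp);
            sleep(1);                   /* let 'low' take m first */
            pthread_mutex_lock(&m);     /* FUTEX_LOCK_PI: boosts 'low' */
            pthread_mutex_unlock(&m);
            return NULL;
    }

    int main(void)
    {
            pthread_mutexattr_t ma;
            pthread_t tl, th;

            pthread_mutexattr_init(&ma);
            pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
            pthread_mutex_init(&m, &ma);

            pthread_create(&tl, NULL, low, NULL);
            pthread_create(&th, NULL, high, NULL);
            pthread_join(tl, NULL);
            pthread_join(th, NULL);
            return 0;
    }

The window the workaround papered over sits between that wakeup and the
deboost; the rt_mutex rework in patches 0001-0013 closes it structurally,
which is why the patch can be dropped from the queue.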
- -Cc: stable-rt@vger.kernel.org -Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> ---- - include/linux/spinlock.h | 6 ++++ - include/linux/spinlock_rt.h | 2 + - kernel/futex.c | 2 - - kernel/locking/rtmutex.c | 53 ++++++++++++++++++++++++++++++++++++++------ - 4 files changed, 55 insertions(+), 8 deletions(-) - ---- a/include/linux/spinlock.h -+++ b/include/linux/spinlock.h -@@ -355,6 +355,12 @@ static __always_inline void spin_unlock( - raw_spin_unlock(&lock->rlock); - } - -+static __always_inline int spin_unlock_no_deboost(spinlock_t *lock) -+{ -+ raw_spin_unlock(&lock->rlock); -+ return 0; -+} -+ - static __always_inline void spin_unlock_bh(spinlock_t *lock) - { - raw_spin_unlock_bh(&lock->rlock); ---- a/include/linux/spinlock_rt.h -+++ b/include/linux/spinlock_rt.h -@@ -26,6 +26,7 @@ extern void __lockfunc rt_spin_lock(spin - extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock); - extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); - extern void __lockfunc rt_spin_unlock(spinlock_t *lock); -+extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock); - extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock); - extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); - extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); -@@ -111,6 +112,7 @@ static inline unsigned long spin_lock_tr - #define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0) - - #define spin_unlock(lock) rt_spin_unlock(lock) -+#define spin_unlock_no_deboost(lock) rt_spin_unlock_no_deboost(lock) - - #define spin_unlock_bh(lock) \ - do { \ ---- a/kernel/futex.c -+++ b/kernel/futex.c -@@ -1377,7 +1377,7 @@ static int wake_futex_pi(u32 __user *uad - * deboost first (and lose our higher priority), then the task might get - * scheduled away before the wake up can take place. 
- */ -- spin_unlock(&hb->lock); -+ deboost |= spin_unlock_no_deboost(&hb->lock); - wake_up_q(&wake_q); - wake_up_q_sleeper(&wake_sleeper_q); - if (deboost) ---- a/kernel/locking/rtmutex.c -+++ b/kernel/locking/rtmutex.c -@@ -997,13 +997,14 @@ static inline void rt_spin_lock_fastlock - slowfn(lock); - } - --static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, -- void (*slowfn)(struct rt_mutex *lock)) -+static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock, -+ int (*slowfn)(struct rt_mutex *lock)) - { -- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) -+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); -- else -- slowfn(lock); -+ return 0; -+ } -+ return slowfn(lock); - } - #ifdef CONFIG_SMP - /* -@@ -1138,7 +1139,7 @@ static void mark_wakeup_next_waiter(stru - /* - * Slow path to release a rt_mutex spin_lock style - */ --static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) -+static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) - { - unsigned long flags; - WAKE_Q(wake_q); -@@ -1153,7 +1154,7 @@ static void noinline __sched rt_spin_lo - if (!rt_mutex_has_waiters(lock)) { - lock->owner = NULL; - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); -- return; -+ return 0; - } - - mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock); -@@ -1164,6 +1165,33 @@ static void noinline __sched rt_spin_lo - - /* Undo pi boosting.when necessary */ - rt_mutex_adjust_prio(current); -+ return 0; -+} -+ -+static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock) -+{ -+ unsigned long flags; -+ WAKE_Q(wake_q); -+ WAKE_Q(wake_sleeper_q); -+ -+ raw_spin_lock_irqsave(&lock->wait_lock, flags); -+ -+ debug_rt_mutex_unlock(lock); -+ -+ rt_mutex_deadlock_account_unlock(current); -+ -+ if (!rt_mutex_has_waiters(lock)) { -+ lock->owner = NULL; -+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); -+ return 0; -+ } -+ -+ mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock); -+ -+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags); -+ wake_up_q(&wake_q); -+ wake_up_q_sleeper(&wake_sleeper_q); -+ return 1; - } - - void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock) -@@ -1221,6 +1249,17 @@ void __lockfunc rt_spin_unlock(spinlock_ - } - EXPORT_SYMBOL(rt_spin_unlock); - -+int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock) -+{ -+ int ret; -+ -+ /* NOTE: we always pass in '1' for nested, for simplicity */ -+ spin_release(&lock->dep_map, 1, _RET_IP_); -+ ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost); -+ migrate_enable(); -+ return ret; -+} -+ - void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) - { - rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); diff --git a/patches/localversion.patch b/patches/localversion.patch index 25e5fadbaae8..e1f3b8d87864 100644 --- a/patches/localversion.patch +++ b/patches/localversion.patch @@ -10,4 +10,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ -+-rt13 ++-rt14 diff --git a/patches/lockdep-Fix-per-cpu-static-objects.patch b/patches/lockdep-Fix-per-cpu-static-objects.patch new file mode 100644 index 000000000000..b795b1481c55 --- /dev/null +++ b/patches/lockdep-Fix-per-cpu-static-objects.patch @@ -0,0 +1,124 @@ +From 8ce371f9846ef1e8b3cc8f6865766cb5c1f17e40 Mon Sep 17 00:00:00 2001 +From: Peter Zijlstra <peterz@infradead.org> +Date: Mon, 20 Mar 2017 12:26:55 +0100 +Subject: [PATCH] lockdep: Fix per-cpu static 
objects + +Since commit 383776fa7527 ("locking/lockdep: Handle statically initialized +PER_CPU locks properly") we try to collapse per-cpu locks into a single +class by giving them all the same key. For this key we choose the canonical +address of the per-cpu object, which would be the offset into the per-cpu +area. + +This has two problems: + + - there is a case where we run !0 lock->key through static_obj() and + expect this to pass; it doesn't for canonical pointers. + + - 0 is a valid canonical address. + +Cure both issues by redefining the canonical address as the address of the +per-cpu variable on the boot CPU. + +Since I didn't want to rely on CPU0 being the boot-cpu, or even existing at +all, track the boot CPU in a variable. + +Fixes: 383776fa7527 ("locking/lockdep: Handle statically initialized PER_CPU locks properly") +Reported-by: kernel test robot <fengguang.wu@intel.com> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Tested-by: Borislav Petkov <bp@suse.de> +Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: linux-mm@kvack.org +Cc: wfg@linux.intel.com +Cc: kernel test robot <fengguang.wu@intel.com> +Cc: LKP <lkp@01.org> +Link: http://lkml.kernel.org/r/20170320114108.kbvcsuepem45j5cr@hirez.programming.kicks-ass.net +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + include/linux/smp.h | 12 ++++++++++++ + kernel/cpu.c | 6 ++++++ + kernel/module.c | 6 +++++- + mm/percpu.c | 5 ++++- + 4 files changed, 27 insertions(+), 2 deletions(-) + +--- a/include/linux/smp.h ++++ b/include/linux/smp.h +@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus; + extern void __init setup_nr_cpu_ids(void); + extern void __init smp_init(void); + ++extern int __boot_cpu_id; ++ ++static inline int get_boot_cpu_id(void) ++{ ++ return __boot_cpu_id; ++} ++ + #else /* !SMP */ + + static inline void smp_send_stop(void) { } +@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_l + static inline void smp_init(void) { } + #endif + ++static inline int get_boot_cpu_id(void) ++{ ++ return 0; ++} ++ + #endif /* !SMP */ + + /* +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -1240,6 +1240,8 @@ core_initcall(cpu_hotplug_pm_sync_init); + + #endif /* CONFIG_PM_SLEEP_SMP */ + ++int __boot_cpu_id; ++ + #endif /* CONFIG_SMP */ + + /* Boot processor state steps */ +@@ -1923,6 +1925,10 @@ void __init boot_cpu_init(void) + set_cpu_active(cpu, true); + set_cpu_present(cpu, true); + set_cpu_possible(cpu, true); ++ ++#ifdef CONFIG_SMP ++ __boot_cpu_id = cpu; ++#endif + } + + /* +--- a/kernel/module.c ++++ b/kernel/module.c +@@ -677,8 +677,12 @@ bool __is_module_percpu_address(unsigned + void *va = (void *)addr; + + if (va >= start && va < start + mod->percpu_size) { +- if (can_addr) ++ if (can_addr) { + *can_addr = (unsigned long) (va - start); ++ *can_addr += (unsigned long) ++ per_cpu_ptr(mod->percpu, ++ get_boot_cpu_id()); ++ } + preempt_enable(); + return true; + } +--- a/mm/percpu.c ++++ b/mm/percpu.c +@@ -1295,8 +1295,11 @@ bool __is_kernel_percpu_address(unsigned + void *va = (void *)addr; + + if (va >= start && va < start + static_size) { +- if (can_addr) ++ if (can_addr) { + *can_addr = (unsigned long) (va - start); ++ *can_addr += (unsigned long) ++ per_cpu_ptr(base, get_boot_cpu_id()); ++ } + return true; + } + } diff --git a/patches/rt-add-rt-locks.patch b/patches/rt-add-rt-locks.patch index 274cfb6cc3bb..c5cd8758c714 100644 --- a/patches/rt-add-rt-locks.patch +++ b/patches/rt-add-rt-locks.patch @@ -24,15 +24,15 
@@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> include/linux/spinlock_rt.h | 162 ++++++++++++ include/linux/spinlock_types.h | 11 include/linux/spinlock_types_rt.h | 48 +++ - kernel/futex.c | 10 + kernel/futex.c | 9 kernel/locking/Makefile | 9 kernel/locking/rt.c | 498 ++++++++++++++++++++++++++++++++++++++ - kernel/locking/rtmutex.c | 460 +++++++++++++++++++++++++++++++++-- - kernel/locking/rtmutex_common.h | 14 - + kernel/locking/rtmutex.c | 463 +++++++++++++++++++++++++++++++++-- + kernel/locking/rtmutex_common.h | 6 kernel/locking/spinlock.c | 7 kernel/locking/spinlock_debug.c | 5 kernel/sched/core.c | 7 - 23 files changed, 1658 insertions(+), 56 deletions(-) + 23 files changed, 1653 insertions(+), 55 deletions(-) --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -915,40 +915,45 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +#endif --- a/kernel/futex.c +++ b/kernel/futex.c -@@ -1301,6 +1301,7 @@ static int wake_futex_pi(u32 __user *uad - struct futex_pi_state *pi_state = this->pi_state; - u32 uninitialized_var(curval), newval; +@@ -1396,6 +1396,7 @@ static int wake_futex_pi(u32 __user *uad + struct task_struct *new_owner; + bool deboost = false; WAKE_Q(wake_q); + WAKE_Q(wake_sleeper_q); - bool deboost; int ret = 0; -@@ -1367,7 +1368,8 @@ static int wake_futex_pi(u32 __user *uad + new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); +@@ -1455,13 +1456,15 @@ static int wake_futex_pi(u32 __user *uad + /* + * We've updated the uservalue, this unlock cannot fail. + */ +- deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); ++ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, ++ &wake_sleeper_q); + out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); -- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); -+ deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, -+ &wake_sleeper_q); - - /* - * First unlock HB so the waiter does not spin on it once he got woken -@@ -1377,6 +1379,7 @@ static int wake_futex_pi(u32 __user *uad - */ - spin_unlock(&hb->lock); - wake_up_q(&wake_q); -+ wake_up_q_sleeper(&wake_sleeper_q); - if (deboost) + if (deboost) { + wake_up_q(&wake_q); ++ wake_up_q_sleeper(&wake_sleeper_q); rt_mutex_adjust_prio(current); + } + +@@ -2664,7 +2667,7 @@ static int futex_lock_pi(u32 __user *uad + goto no_block; + } + +- rt_mutex_init_waiter(&rt_waiter); ++ rt_mutex_init_waiter(&rt_waiter, false); -@@ -2850,10 +2853,7 @@ static int futex_wait_requeue_pi(u32 __u + /* + * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not +@@ -3029,7 +3032,7 @@ static int futex_wait_requeue_pi(u32 __u * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ -- debug_rt_mutex_init_waiter(&rt_waiter); -- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry); -- RB_CLEAR_NODE(&rt_waiter.tree_entry); -- rt_waiter.task = NULL; +- rt_mutex_init_waiter(&rt_waiter); + rt_mutex_init_waiter(&rt_waiter, false); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE); @@ -1604,7 +1609,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* * The current top waiter stays enqueued. 
We * don't have to change anything in the lock -@@ -948,6 +982,352 @@ static int try_to_take_rt_mutex(struct r +@@ -946,6 +980,350 @@ static int try_to_take_rt_mutex(struct r return 1; } @@ -1618,7 +1623,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> + might_sleep_no_state_check(); + + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) -+ rt_mutex_deadlock_account_lock(lock, current); ++ return; + else + slowfn(lock); +} @@ -1627,7 +1632,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) -+ rt_mutex_deadlock_account_unlock(current); ++ return; + else + slowfn(lock); +} @@ -1774,8 +1779,6 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> + + debug_rt_mutex_unlock(lock); + -+ rt_mutex_deadlock_account_unlock(current); -+ + if (!rt_mutex_has_waiters(lock)) { + lock->owner = NULL; + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1957,7 +1960,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* * Task blocks on lock. * -@@ -1060,6 +1440,7 @@ static int task_blocks_on_rt_mutex(struc +@@ -1058,6 +1436,7 @@ static int task_blocks_on_rt_mutex(struc * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, @@ -1965,7 +1968,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; -@@ -1088,7 +1469,10 @@ static void mark_wakeup_next_waiter(stru +@@ -1086,7 +1465,10 @@ static void mark_wakeup_next_waiter(stru raw_spin_unlock(¤t->pi_lock); @@ -1977,7 +1980,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } /* -@@ -1169,11 +1553,11 @@ void rt_mutex_adjust_pi(struct task_stru +@@ -1167,21 +1549,22 @@ void rt_mutex_adjust_pi(struct task_stru return; } next_lock = waiter->lock; @@ -1990,18 +1993,28 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, next_lock, NULL, task); } -@@ -1260,9 +1644,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, + +-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) ++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) + { + debug_rt_mutex_init_waiter(waiter); + RB_CLEAR_NODE(&waiter->pi_tree_entry); + RB_CLEAR_NODE(&waiter->tree_entry); + waiter->task = NULL; ++ waiter->savestate = savestate; + } + + /** +@@ -1266,7 +1649,7 @@ rt_mutex_slowlock(struct rt_mutex *lock, unsigned long flags; int ret = 0; -- debug_rt_mutex_init_waiter(&waiter); -- RB_CLEAR_NODE(&waiter.pi_tree_entry); -- RB_CLEAR_NODE(&waiter.tree_entry); +- rt_mutex_init_waiter(&waiter); + rt_mutex_init_waiter(&waiter, false); /* * Technically we could use raw_spin_[un]lock_irq() here, but this can -@@ -1356,7 +1738,8 @@ static inline int rt_mutex_slowtrylock(s +@@ -1360,7 +1743,8 @@ static inline int rt_mutex_slowtrylock(s * Return whether the current task needs to undo a potential priority boosting. */ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, @@ -2011,7 +2024,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> { unsigned long flags; -@@ -1412,7 +1795,7 @@ static bool __sched rt_mutex_slowunlock( +@@ -1414,7 +1798,7 @@ static bool __sched rt_mutex_slowunlock( * * Queue the next waiter for wakeup once we release the wait_lock. 
*/ @@ -2020,7 +2033,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> raw_spin_unlock_irqrestore(&lock->wait_lock, flags); -@@ -1469,17 +1852,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lo +@@ -1468,17 +1852,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lo static inline void rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, @@ -2030,37 +2043,56 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> { WAKE_Q(wake_q); + WAKE_Q(wake_sleeper_q); + bool deboost; - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; - } else { -- bool deboost = slowfn(lock, &wake_q); -+ bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q); +- deboost = slowfn(lock, &wake_q); ++ deboost = slowfn(lock, &wake_q, &wake_sleeper_q); - wake_up_q(&wake_q); -+ wake_up_q_sleeper(&wake_sleeper_q); + wake_up_q(&wake_q); ++ wake_up_q_sleeper(&wake_sleeper_q); - /* Undo pi boosting if necessary: */ - if (deboost) -@@ -1616,13 +2002,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); - * required or not. + /* Undo pi boosting if necessary: */ + if (deboost) +@@ -1606,7 +1993,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock); + * simple and will not need to retry. */ - bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock, -- struct wake_q_head *wqh) -+ struct wake_q_head *wqh, -+ struct wake_q_head *wq_sleeper) + bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, +- struct wake_q_head *wake_q) ++ struct wake_q_head *wake_q, ++ struct wake_q_head *wq_sleeper) { - if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { - rt_mutex_deadlock_account_unlock(current); - return false; + lockdep_assert_held(&lock->wait_lock); + +@@ -1617,21 +2005,23 @@ bool __sched __rt_mutex_futex_unlock(str + return false; /* done */ } -- return rt_mutex_slowunlock(lock, wqh); -+ return rt_mutex_slowunlock(lock, wqh, wq_sleeper); + +- mark_wakeup_next_waiter(wake_q, lock); ++ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock); + return true; /* deboost and wakeups */ } - /** -@@ -1655,13 +2042,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); + void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) + { + WAKE_Q(wake_q); ++ WAKE_Q(wake_sleeper_q); + bool deboost; + + raw_spin_lock_irq(&lock->wait_lock); +- deboost = __rt_mutex_futex_unlock(lock, &wake_q); ++ deboost = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q); + raw_spin_unlock_irq(&lock->wait_lock); + + if (deboost) { + wake_up_q(&wake_q); ++ wake_up_q_sleeper(&wake_sleeper_q); + rt_mutex_adjust_prio(current); + } + } +@@ -1666,13 +2056,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy); void __rt_mutex_init(struct rt_mutex *lock, const char *name) { lock->owner = NULL; @@ -2075,7 +2107,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a -@@ -1676,7 +2062,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); +@@ -1687,7 +2076,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { @@ -2083,10 +2115,10 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> + rt_mutex_init(lock); debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); - rt_mutex_deadlock_account_lock(lock, proxy_owner); -@@ -1838,3 +2224,25 @@ int rt_mutex_finish_proxy_lock(struct rt + } +@@ -1893,3 +2282,25 @@ bool rt_mutex_cleanup_proxy_lock(struct - return ret; + return cleanup; } + +#ifdef 
CONFIG_PREEMPT_RT_FULL @@ -2120,31 +2152,25 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> #ifdef CONFIG_DEBUG_RT_MUTEXES unsigned long ip; struct pid *deadlock_task_pid; -@@ -114,7 +115,8 @@ extern int rt_mutex_finish_proxy_lock(st - struct rt_mutex_waiter *waiter); - extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to); - extern bool rt_mutex_futex_unlock(struct rt_mutex *lock, -- struct wake_q_head *wqh); -+ struct wake_q_head *wqh, -+ struct wake_q_head *wq_sleeper); - extern void rt_mutex_adjust_prio(struct task_struct *task); +@@ -106,7 +107,7 @@ extern void rt_mutex_init_proxy_locked(s + struct task_struct *proxy_owner); + extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, + struct task_struct *proxy_owner); +-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); ++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate); + extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task); +@@ -123,7 +124,8 @@ extern int rt_mutex_futex_trylock(struct + + extern void rt_mutex_futex_unlock(struct rt_mutex *lock); + extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, +- struct wake_q_head *wqh); ++ struct wake_q_head *wqh, ++ struct wake_q_head *wq_sleeper); - #ifdef CONFIG_DEBUG_RT_MUTEXES -@@ -123,4 +125,14 @@ extern void rt_mutex_adjust_prio(struct - # include "rtmutex.h" - #endif + extern void rt_mutex_adjust_prio(struct task_struct *task); -+static inline void -+rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) -+{ -+ debug_rt_mutex_init_waiter(waiter); -+ waiter->task = NULL; -+ waiter->savestate = savestate; -+ RB_CLEAR_NODE(&waiter->pi_tree_entry); -+ RB_CLEAR_NODE(&waiter->tree_entry); -+} -+ - #endif --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(loc diff --git a/patches/rt-locking-Reenable-migration-accross-schedule.patch b/patches/rt-locking-Reenable-migration-accross-schedule.patch index 9b386af9fead..7ef4dfeb89c2 100644 --- a/patches/rt-locking-Reenable-migration-accross-schedule.patch +++ b/patches/rt-locking-Reenable-migration-accross-schedule.patch @@ -18,7 +18,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -988,14 +988,19 @@ static int __try_to_take_rt_mutex(struct +@@ -986,14 +986,19 @@ static int __try_to_take_rt_mutex(struct * preemptible spin_lock functions: */ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, @@ -33,14 +33,14 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> + migrate_disable(); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - rt_mutex_deadlock_account_lock(lock, current); + return; else - slowfn(lock); + slowfn(lock, do_mig_dis); } - static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock, -@@ -1054,7 +1059,8 @@ static int task_blocks_on_rt_mutex(struc + static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, +@@ -1051,7 +1056,8 @@ static int task_blocks_on_rt_mutex(struc * We store the current state under p->pi_lock in p->saved_state and * the try_to_wake_up() code handles this accordingly. 
*/ @@ -50,7 +50,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> { struct task_struct *lock_owner, *self = current; struct rt_mutex_waiter waiter, *top_waiter; -@@ -1098,8 +1104,13 @@ static void noinline __sched rt_spin_lo +@@ -1095,8 +1101,13 @@ static void noinline __sched rt_spin_lo debug_rt_mutex_print_deadlock(&waiter); @@ -65,7 +65,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> raw_spin_lock_irqsave(&lock->wait_lock, flags); -@@ -1197,38 +1208,35 @@ static int noinline __sched rt_spin_lock +@@ -1165,38 +1176,35 @@ static void noinline __sched rt_spin_lo void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock) { diff --git a/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch b/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch index b05524f25aeb..bbb8795771ae 100644 --- a/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch +++ b/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch @@ -21,7 +21,7 @@ Cc: stable-rt@vger.kernel.org --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -1690,7 +1690,7 @@ int rt_mutex_start_proxy_lock(struct rt_ +@@ -1682,7 +1682,7 @@ int __rt_mutex_start_proxy_lock(struct r ret = 0; } @@ -29,4 +29,4 @@ Cc: stable-rt@vger.kernel.org + if (ret && rt_mutex_has_waiters(lock)) remove_waiter(lock, waiter); - raw_spin_unlock_irq(&lock->wait_lock); + debug_rt_mutex_print_deadlock(waiter); diff --git a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch index 56afc2458734..68142ad38c64 100644 --- a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch +++ b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch @@ -22,8 +22,8 @@ lockdep says: Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> --- - kernel/locking/rtmutex.c | 251 ++++++++++++++++++++++++++++++++++++++++++----- - 1 file changed, 226 insertions(+), 25 deletions(-) + kernel/locking/rtmutex.c | 248 ++++++++++++++++++++++++++++++++++++++++++----- + 1 file changed, 224 insertions(+), 24 deletions(-) --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -35,7 +35,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> #include "rtmutex_common.h" -@@ -1360,6 +1361,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init); +@@ -1317,6 +1318,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init); #endif /* PREEMPT_RT_FULL */ @@ -76,7 +76,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> static inline int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, struct rt_mutex_waiter *waiter) -@@ -1614,7 +1649,8 @@ void rt_mutex_adjust_pi(struct task_stru +@@ -1580,7 +1615,8 @@ void rt_mutex_init_waiter(struct rt_mute static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, @@ -86,7 +86,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> { int ret = 0; -@@ -1637,6 +1673,12 @@ static int __sched +@@ -1603,6 +1639,12 @@ static int __sched break; } @@ -99,7 +99,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> raw_spin_unlock_irq(&lock->wait_lock); debug_rt_mutex_print_deadlock(waiter); -@@ -1671,13 +1713,90 @@ static void rt_mutex_handle_deadlock(int +@@ -1637,13 +1679,90 @@ static void rt_mutex_handle_deadlock(int } } @@ -191,7 +191,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> { struct rt_mutex_waiter waiter; unsigned long flags; -@@ -1697,6 +1816,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, +@@ -1663,6 +1782,8 
@@ rt_mutex_slowlock(struct rt_mutex *lock, /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { @@ -200,7 +200,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return 0; } -@@ -1711,13 +1832,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, +@@ -1677,13 +1798,23 @@ rt_mutex_slowlock(struct rt_mutex *lock, if (likely(!ret)) /* sleep on the mutex */ @@ -226,7 +226,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> } /* -@@ -1850,31 +1981,36 @@ static bool __sched rt_mutex_slowunlock( +@@ -1814,29 +1945,33 @@ static bool __sched rt_mutex_slowunlock( */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, @@ -237,13 +237,11 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx)) { - if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else -- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); -+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, -+ ww_ctx); + +- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); ++ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx); } static inline int @@ -258,16 +256,15 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> + struct ww_acquire_ctx *ww_ctx)) { if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) { - rt_mutex_deadlock_account_lock(lock, current); + likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - } else -- return slowfn(lock, state, timeout, chwalk); -+ return slowfn(lock, state, timeout, chwalk, ww_ctx); + +- return slowfn(lock, state, timeout, chwalk); ++ return slowfn(lock, state, timeout, chwalk, ww_ctx); } static inline int -@@ -1921,7 +2057,7 @@ void __sched rt_mutex_lock(struct rt_mut +@@ -1881,7 +2016,7 @@ void __sched rt_mutex_lock(struct rt_mut { might_sleep(); @@ -276,7 +273,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> } EXPORT_SYMBOL_GPL(rt_mutex_lock); -@@ -1938,7 +2074,7 @@ int __sched rt_mutex_lock_interruptible( +@@ -1898,7 +2033,7 @@ int __sched rt_mutex_lock_interruptible( { might_sleep(); @@ -285,16 +282,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); -@@ -1951,7 +2087,7 @@ int rt_mutex_timed_futex_lock(struct rt_ - might_sleep(); - - return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, -- RT_MUTEX_FULL_CHAINWALK, -+ RT_MUTEX_FULL_CHAINWALK, NULL, - rt_mutex_slowlock); - } - -@@ -1970,7 +2106,7 @@ int __sched rt_mutex_lock_killable(struc +@@ -1925,7 +2060,7 @@ int __sched rt_mutex_lock_killable(struc { might_sleep(); @@ -303,7 +291,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> } EXPORT_SYMBOL_GPL(rt_mutex_lock_killable); -@@ -1994,6 +2130,7 @@ rt_mutex_timed_lock(struct rt_mutex *loc +@@ -1949,6 +2084,7 @@ rt_mutex_timed_lock(struct rt_mutex *loc return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, RT_MUTEX_MIN_CHAINWALK, @@ -311,17 +299,17 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); -@@ -2248,7 +2385,7 @@ int rt_mutex_finish_proxy_lock(struct rt +@@ -2230,7 +2366,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m 
set_current_state(TASK_INTERRUPTIBLE); /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); - if (unlikely(ret)) - remove_waiter(lock, waiter); -@@ -2264,24 +2401,88 @@ int rt_mutex_finish_proxy_lock(struct rt - return ret; + raw_spin_unlock_irq(&lock->wait_lock); + +@@ -2283,24 +2419,88 @@ bool rt_mutex_cleanup_proxy_lock(struct + return cleanup; } -#ifdef CONFIG_PREEMPT_RT_FULL diff --git a/patches/rtmutex-futex-prepare-rt.patch b/patches/rtmutex-futex-prepare-rt.patch index 3d966f0febbe..6186521366c1 100644 --- a/patches/rtmutex-futex-prepare-rt.patch +++ b/patches/rtmutex-futex-prepare-rt.patch @@ -15,7 +15,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/kernel/futex.c +++ b/kernel/futex.c -@@ -1924,6 +1924,16 @@ static int futex_requeue(u32 __user *uad +@@ -2009,6 +2009,16 @@ static int futex_requeue(u32 __user *uad requeue_pi_wake_futex(this, &key2, hb2); drop_count++; continue; @@ -32,16 +32,16 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } else if (ret) { /* * rt_mutex_start_proxy_lock() detected a -@@ -2813,7 +2823,7 @@ static int futex_wait_requeue_pi(u32 __u - { +@@ -2992,7 +3002,7 @@ static int futex_wait_requeue_pi(u32 __u struct hrtimer_sleeper timeout, *to = NULL; + struct futex_pi_state *pi_state = NULL; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; + struct futex_hash_bucket *hb, *hb2; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; -@@ -2872,20 +2882,55 @@ static int futex_wait_requeue_pi(u32 __u +@@ -3048,20 +3058,55 @@ static int futex_wait_requeue_pi(u32 __u /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); @@ -108,7 +108,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { -@@ -2894,7 +2939,8 @@ static int futex_wait_requeue_pi(u32 __u +@@ -3070,7 +3115,8 @@ static int futex_wait_requeue_pi(u32 __u * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { @@ -116,9 +116,9 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); - if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) - rt_mutex_unlock(&q.pi_state->pi_mutex); -@@ -2903,7 +2949,7 @@ static int futex_wait_requeue_pi(u32 __u + if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) { + pi_state = q.pi_state; +@@ -3081,7 +3127,7 @@ static int futex_wait_requeue_pi(u32 __u * the requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); @@ -127,16 +127,16 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } } else { struct rt_mutex *pi_mutex; -@@ -2918,7 +2964,8 @@ static int futex_wait_requeue_pi(u32 __u - ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter); - debug_rt_mutex_free_waiter(&rt_waiter); +@@ -3095,7 +3141,8 @@ static int futex_wait_requeue_pi(u32 __u + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); - /* - * Fixup the pi_state owner and possibly acquire the lock if we - * haven't already. 
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) + ret = 0; + --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -133,6 +133,11 @@ static void fixup_rt_mutex_waiters(struc @@ -170,7 +170,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> goto out_unlock_pi; /* -@@ -971,6 +977,23 @@ static int task_blocks_on_rt_mutex(struc +@@ -969,6 +975,23 @@ static int task_blocks_on_rt_mutex(struc return -EDEADLK; raw_spin_lock(&task->pi_lock); @@ -194,7 +194,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> __rt_mutex_adjust_prio(task); waiter->task = task; waiter->lock = lock; -@@ -994,7 +1017,7 @@ static int task_blocks_on_rt_mutex(struc +@@ -992,7 +1015,7 @@ static int task_blocks_on_rt_mutex(struc rt_mutex_enqueue_pi(owner, waiter); __rt_mutex_adjust_prio(owner); @@ -203,7 +203,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { chain_walk = 1; -@@ -1078,7 +1101,7 @@ static void remove_waiter(struct rt_mute +@@ -1076,7 +1099,7 @@ static void remove_waiter(struct rt_mute { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); @@ -212,7 +212,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> raw_spin_lock(¤t->pi_lock); rt_mutex_dequeue(lock, waiter); -@@ -1102,7 +1125,8 @@ static void remove_waiter(struct rt_mute +@@ -1100,7 +1123,8 @@ static void remove_waiter(struct rt_mute __rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ @@ -222,7 +222,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> raw_spin_unlock(&owner->pi_lock); -@@ -1138,7 +1162,7 @@ void rt_mutex_adjust_pi(struct task_stru +@@ -1136,7 +1160,7 @@ void rt_mutex_adjust_pi(struct task_stru raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; diff --git a/patches/rtmutex-lock-killable.patch b/patches/rtmutex-lock-killable.patch index ac46c07b973b..1d8a14060569 100644 --- a/patches/rtmutex-lock-killable.patch +++ b/patches/rtmutex-lock-killable.patch @@ -23,7 +23,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -1531,6 +1531,25 @@ int rt_mutex_timed_futex_lock(struct rt_ +@@ -1524,6 +1524,25 @@ int __sched rt_mutex_futex_trylock(struc } /** diff --git a/patches/rtmutex-trylock-is-okay-on-RT.patch b/patches/rtmutex-trylock-is-okay-on-RT.patch index 1aa6206dbc90..e87897594e44 100644 --- a/patches/rtmutex-trylock-is-okay-on-RT.patch +++ b/patches/rtmutex-trylock-is-okay-on-RT.patch @@ -13,7 +13,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c -@@ -1542,7 +1542,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); +@@ -1535,7 +1535,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); */ int __sched rt_mutex_trylock(struct rt_mutex *lock) { diff --git a/patches/series b/patches/series index 3766a2fbe2dc..137fcb9ca0f8 100644 --- a/patches/series +++ b/patches/series @@ -17,6 +17,21 @@ timer-make-the-base-lock-raw.patch ############################################################ lockdep-Handle-statically-initialized-PER_CPU-locks-.patch lockdep-Fix-compilation-error-for-CONFIG_MODULES-and.patch +lockdep-Fix-per-cpu-static-objects.patch + +0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch +0002-futex-Use-smp_store_release-in-mark_wake_futex.patch +0003-futex-Remove-rt_mutex_deadlock_account_.patch 
+0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch +0005-futex-Change-locking-rules.patch +0006-futex-Cleanup-refcounting.patch +0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch +0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch +0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch +0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch +0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch +0012-futex-Futex_unlock_pi-determinism.patch +0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch # Those two should vanish soon (not use PIT during bootup) at91_dont_enable_disable_clock.patch @@ -183,6 +198,7 @@ preempt-nort-rt-variants.patch # local locks & migrate disable introduce_migrate_disable_cpu_light.patch +futex-workaround-migrate_disable-enable-in-different.patch rt-local-irq-lock.patch locallock-add-local_lock_on.patch @@ -333,7 +349,6 @@ rtmutex-avoid-include-hell.patch rtmutex_dont_include_rcu.patch rt-add-rt-locks.patch rt-drop_mutex_disable_on_not_debug.patch -kernel-futex-don-t-deboost-too-early.patch rtmutex-add-a-first-shot-of-ww_mutex.patch ptrace-fix-ptrace-vs-tasklist_lock-race.patch |
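To exercise the reworked futex_lock_pi()/futex_unlock_pi() paths from
userspace, the sketch below drives them through the raw futex(2) syscall,
following the documented PI protocol: the futex word holds the owner's TID
and the kernel is entered only on contention. sys_futex(), lock_pi(),
unlock_pi() and worker() are names invented for the sketch; thread and
iteration counts are arbitrary and error handling is reduced to asserts.

    #define _GNU_SOURCE
    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <pthread.h>
    #include <stdatomic.h>
    #include <stdio.h>
    #include <assert.h>

    /* The futex word: 0 = unlocked, otherwise the owner's TID, with
     * FUTEX_WAITERS set by the kernel while someone is queued on it. */
    static atomic_uint futex_word;

    static long sys_futex(void *uaddr, int op)
    {
            return syscall(SYS_futex, uaddr, op, 0, NULL, NULL, 0);
    }

    static unsigned int tid(void)
    {
            return (unsigned int)syscall(SYS_gettid);
    }

    static void lock_pi(void)
    {
            unsigned int expected = 0;

            /* Fast path: uncontended 0 -> TID, no kernel entry. */
            if (atomic_compare_exchange_strong(&futex_word, &expected, tid()))
                    return;
            /* Slow path: futex_lock_pi() queues us on the pi_state rtmutex. */
            assert(sys_futex(&futex_word, FUTEX_LOCK_PI) == 0);
    }

    static void unlock_pi(void)
    {
            unsigned int expected = tid();

            /* Fast path: only succeeds while FUTEX_WAITERS is not set. */
            if (atomic_compare_exchange_strong(&futex_word, &expected, 0))
                    return;
            /* Slow path: futex_unlock_pi() -> wake_futex_pi() hands over. */
            assert(sys_futex(&futex_word, FUTEX_UNLOCK_PI) == 0);
    }

    static void *worker(void *arg)
    {
            for (int i = 0; i < 100000; i++) {
                    lock_pi();
                    unlock_pi();
            }
            return NULL;
    }

    int main(void)
    {
            pthread_t t[4];

            for (int i = 0; i < 4; i++)
                    pthread_create(&t[i], NULL, worker, NULL);
            for (int i = 0; i < 4; i++)
                    pthread_join(t[i], NULL);
            puts("ok");
            return 0;
    }

Running a few instances of this on an -rt kernel stresses exactly the
hb->lock / wait_lock hand-over introduced by the series above.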