author    Sebastian Andrzej Siewior <bigeasy@linutronix.de>    2017-03-28 13:06:54 +0200
committer Sebastian Andrzej Siewior <bigeasy@linutronix.de>    2017-03-28 13:06:54 +0200
commit    e6a76dc34a7be542641b32770e0ca3e29f507bb9 (patch)
tree      0e10df8c17709cad27af0af4281b5cdd56b2b9c8
parent    950728f5efcdc960f3274cb3206418d0178c866c (diff)
download  linux-rt-e6a76dc34a7be542641b32770e0ca3e29f507bb9.tar.gz
[ANNOUNCE] v4.9.18-rt14 (tag: v4.9.18-rt14-patches)
Dear RT folks!

I'm pleased to announce the v4.9.18-rt14 patch set.

Changes since v4.9.18-rt13:

- v4.9.11-rt9 had a fix for statically initialized PER_CPU locks. An
  issue with nested locks came up which was noticed by the kernel test
  robot and fixed by Peter Zijlstra.

- A larger rework of the futex / rtmutex code. In v4.8-rt1 we added a
  workaround so we don't de-boost too early in the unlock path. A small
  window remained in which the locking thread could de-boost the
  unlocking thread. This rework by Peter Zijlstra fixes the issue.

Known issues
- CPU hotplug got a little better but can deadlock.

- The radeon driver. Probably since a change in the driver (or DRM core)
  the radeon driver can hang. This problem starts probably with the v3.18
  release.

- gdb. While gdb is following a task it is possible that after a fork()
  operation the task is waiting for gdb and gdb waiting for the task.

The delta patch against v4.9.18-rt13 is appended below and can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/incr/patch-4.9.18-rt13-rt14.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.9.18-rt14

The RT patch against v4.9.18 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patch-4.9.18-rt14.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patches-4.9.18-rt14.tar.xz

Sebastian

diff --git a/include/linux/smp.h b/include/linux/smp.h
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
 extern void __init setup_nr_cpu_ids(void);
 extern void __init smp_init(void);

+extern int __boot_cpu_id;
+
+static inline int get_boot_cpu_id(void)
+{
+        return __boot_cpu_id;
+}
+
 #else /* !SMP */

 static inline void smp_send_stop(void) { }
@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_late_init(); }
 static inline void smp_init(void) { }
 #endif

+static inline int get_boot_cpu_id(void)
+{
+        return 0;
+}
+
 #endif /* !SMP */

 /*
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -355,12 +355,6 @@ static __always_inline void spin_unlock(spinlock_t *lock)
         raw_spin_unlock(&lock->rlock);
 }

-static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
-{
-        raw_spin_unlock(&lock->rlock);
-        return 0;
-}
-
 static __always_inline void spin_unlock_bh(spinlock_t *lock)
 {
         raw_spin_unlock_bh(&lock->rlock);
diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h
--- a/include/linux/spinlock_rt.h
+++ b/include/linux/spinlock_rt.h
@@ -26,7 +26,6 @@
 extern void __lockfunc rt_spin_lock(spinlock_t *lock);
 extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
 extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
 extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
-extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
 extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
 extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
 extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
@@ -112,7 +111,6 @@ static inline unsigned long spin_lock_trace_flags(spinlock_t *lock)
 #define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)

 #define spin_unlock(lock)                       rt_spin_unlock(lock)
-#define spin_unlock_no_deboost(lock)            rt_spin_unlock_no_deboost(lock)

 #define spin_unlock_bh(lock)                            \
         do {                                            \
diff --git a/kernel/cpu.c b/kernel/cpu.c
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1562,6 +1562,8 @@ core_initcall(cpu_hotplug_pm_sync_init);

 #endif /* CONFIG_PM_SLEEP_SMP */

+int __boot_cpu_id;
+
 #endif /* CONFIG_SMP */

 /* Boot processor state steps */
@@ -2245,6 +2247,10 @@ void __init boot_cpu_init(void)
         set_cpu_active(cpu, true);
         set_cpu_present(cpu, true);
         set_cpu_possible(cpu, true);
+
+#ifdef CONFIG_SMP
+        __boot_cpu_id = cpu;
+#endif
 }

 /*
diff --git a/kernel/futex.c b/kernel/futex.c
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -800,7 +800,7 @@ static int refill_pi_state_cache(void)
         return 0;
 }

-static struct futex_pi_state * alloc_pi_state(void)
+static struct futex_pi_state *alloc_pi_state(void)
 {
         struct futex_pi_state *pi_state = current->pi_state_cache;

@@ -810,6 +810,11 @@ static struct futex_pi_state * alloc_pi_state(void)
         return pi_state;
 }

+static void get_pi_state(struct futex_pi_state *pi_state)
+{
+        WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
+}
+
 /*
  * Drops a reference to the pi_state object and frees or caches it
  * when the last reference is gone.
@@ -854,7 +859,7 @@ static void put_pi_state(struct futex_pi_state *pi_state)
  * Look up the task based on what TID userspace gave us.
  * We dont trust it.
  */
-static struct task_struct * futex_find_get_task(pid_t pid)
+static struct task_struct *futex_find_get_task(pid_t pid)
 {
         struct task_struct *p;

@@ -916,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
                 pi_state->owner = NULL;
                 raw_spin_unlock_irq(&curr->pi_lock);

-                rt_mutex_unlock(&pi_state->pi_mutex);
-
+                get_pi_state(pi_state);
                 spin_unlock(&hb->lock);

+                rt_mutex_futex_unlock(&pi_state->pi_mutex);
+                put_pi_state(pi_state);
+
                 raw_spin_lock_irq(&curr->pi_lock);
         }
         raw_spin_unlock_irq(&curr->pi_lock);
@@ -973,6 +980,39 @@ void exit_pi_state_list(struct task_struct *curr)
  *
  * [10] There is no transient state which leaves owner and user space
  *      TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ *      hb -> futex_q, relation
+ *      futex_q -> pi_state, relation
+ *
+ *      (cannot be raw because hb can contain arbitrary amount
+ *       of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ *      {uval, pi_state}
+ *
+ *      (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ *      p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ *      pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ *   hb->lock
+ *     pi_mutex->wait_lock
+ *       p->pi_lock
+ *
  */

 /*
@@ -980,10 +1020,12 @@ void exit_pi_state_list(struct task_struct *curr)
  * the pi_state against the user space value. If correct, attach to
  * it.
  */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+                              struct futex_pi_state *pi_state,
                               struct futex_pi_state **ps)
 {
         pid_t pid = uval & FUTEX_TID_MASK;
+        int ret, uval2;

         /*
          * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,9 +1033,39 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
         if (unlikely(!pi_state))
                 return -EINVAL;

+        /*
+         * We get here with hb->lock held, and having found a
+         * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+         * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+         * which in turn means that futex_lock_pi() still has a reference on
+         * our pi_state.
+         *
+         * The waiter holding a reference on @pi_state also protects against
+         * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+         * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+         * free pi_state before we can take a reference ourselves.
+         */
         WARN_ON(!atomic_read(&pi_state->refcount));

         /*
+         * Now that we have a pi_state, we can acquire wait_lock
+         * and do the state validation.
+         */
+        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+        /*
+         * Since {uval, pi_state} is serialized by wait_lock, and our current
+         * uval was read without holding it, it can have changed. Verify it
+         * still is what we expect it to be, otherwise retry the entire
+         * operation.
+         */
+        if (get_futex_value_locked(&uval2, uaddr))
+                goto out_efault;
+
+        if (uval != uval2)
+                goto out_eagain;
+
+        /*
          * Handle the owner died case:
          */
         if (uval & FUTEX_OWNER_DIED) {
@@ -1008,11 +1080,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                  * is not 0. Inconsistent state. [5]
                  */
                 if (pid)
-                        return -EINVAL;
+                        goto out_einval;
                 /*
                  * Take a ref on the state and return success. [4]
                  */
-                goto out_state;
+                goto out_attach;
         }

         /*
@@ -1024,14 +1096,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                  * Take a ref on the state and return success. [6]
                  */
                 if (!pid)
-                        goto out_state;
+                        goto out_attach;
         } else {
                 /*
                  * If the owner died bit is not set, then the pi_state
                  * must have an owner. [7]
                  */
                 if (!pi_state->owner)
-                        return -EINVAL;
+                        goto out_einval;
         }

         /*
@@ -1040,11 +1112,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
          * user space TID. [9/10]
          */
         if (pid != task_pid_vnr(pi_state->owner))
-                return -EINVAL;
-out_state:
-        atomic_inc(&pi_state->refcount);
+                goto out_einval;
+
+out_attach:
+        get_pi_state(pi_state);
+        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
         *ps = pi_state;
         return 0;
+
+out_einval:
+        ret = -EINVAL;
+        goto out_error;
+
+out_eagain:
+        ret = -EAGAIN;
+        goto out_error;
+
+out_efault:
+        ret = -EFAULT;
+        goto out_error;
+
+out_error:
+        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+        return ret;
 }

 /*
@@ -1095,6 +1185,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,

         /*
          * No existing pi state. First waiter. [2]
+         *
+         * This creates pi_state, we have hb->lock held, this means nothing can
+         * observe this state, wait_lock is irrelevant.
          */
         pi_state = alloc_pi_state();

@@ -1119,17 +1212,18 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
         return 0;
 }

-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+                           struct futex_hash_bucket *hb,
                            union futex_key *key, struct futex_pi_state **ps)
 {
-        struct futex_q *match = futex_top_waiter(hb, key);
+        struct futex_q *top_waiter = futex_top_waiter(hb, key);

         /*
          * If there is a waiter on that futex, validate it and
          * attach to the pi_state when the validation succeeds.
          */
-        if (match)
-                return attach_to_pi_state(uval, match->pi_state, ps);
+        if (top_waiter)
+                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

         /*
          * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1242,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
                 return -EFAULT;

-        /*If user space value changed, let the caller retry */
+        /* If user space value changed, let the caller retry */
         return curval != uval ? -EAGAIN : 0;
 }

@@ -1176,7 +1270,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
                                 struct task_struct *task, int set_waiters)
 {
         u32 uval, newval, vpid = task_pid_vnr(task);
-        struct futex_q *match;
+        struct futex_q *top_waiter;
         int ret;

         /*
@@ -1202,9 +1296,9 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
          * Lookup existing state first. If it exists, try to attach to
          * its pi_state.
          */
-        match = futex_top_waiter(hb, key);
-        if (match)
-                return attach_to_pi_state(uval, match->pi_state, ps);
+        top_waiter = futex_top_waiter(hb, key);
+        if (top_waiter)
+                return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);

         /*
          * No waiter and user TID is 0. We are here because the
@@ -1290,46 +1384,39 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
          * memory barrier is required here to prevent the following
          * store to lock_ptr from getting ahead of the plist_del.
          */
-        smp_wmb();
-        q->lock_ptr = NULL;
+        smp_store_release(&q->lock_ptr, NULL);
 }

-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
-                         struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
 {
-        struct task_struct *new_owner;
-        struct futex_pi_state *pi_state = this->pi_state;
         u32 uninitialized_var(curval), newval;
+        struct task_struct *new_owner;
+        bool deboost = false;
         WAKE_Q(wake_q);
         WAKE_Q(wake_sleeper_q);
-        bool deboost;
         int ret = 0;

-        if (!pi_state)
-                return -EINVAL;
-
-        /*
-         * If current does not own the pi_state then the futex is
-         * inconsistent and user space fiddled with the futex value.
-         */
-        if (pi_state->owner != current)
-                return -EINVAL;
-
-        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
         new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+        if (WARN_ON_ONCE(!new_owner)) {
+                /*
+                 * As per the comment in futex_unlock_pi() this should not happen.
+                 *
+                 * When this happens, give up our locks and try again, giving
+                 * the futex_lock_pi() instance time to complete, either by
+                 * waiting on the rtmutex or removing itself from the futex
+                 * queue.
+                 */
+                ret = -EAGAIN;
+                goto out_unlock;
+        }

         /*
-         * It is possible that the next waiter (the one that brought
-         * this owner to the kernel) timed out and is no longer
-         * waiting on the lock.
-         */
-        if (!new_owner)
-                new_owner = this->task;
-
-        /*
-         * We pass it to the next owner. The WAITERS bit is always
-         * kept enabled while there is PI state around. We cleanup the
-         * owner died bit, because we are the owner.
+         * We pass it to the next owner. The WAITERS bit is always kept
+         * enabled while there is PI state around. We cleanup the owner
+         * died bit, because we are the owner.
          */
         newval = FUTEX_WAITERS | task_pid_vnr(new_owner);

@@ -1338,6 +1425,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
                 ret = -EFAULT;
+
         } else if (curval != uval) {
                 /*
                  * If a unconditional UNLOCK_PI operation (user space did not
@@ -1350,10 +1438,9 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
                 else
                         ret = -EINVAL;
         }
-        if (ret) {
-                raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-                return ret;
-        }
+
+        if (ret)
+                goto out_unlock;

         raw_spin_lock(&pi_state->owner->pi_lock);
         WARN_ON(list_empty(&pi_state->list));
@@ -1366,24 +1453,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
         pi_state->owner = new_owner;
         raw_spin_unlock(&new_owner->pi_lock);

+        /*
+         * We've updated the uservalue, this unlock cannot fail.
+         */
+        deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
+                                          &wake_sleeper_q);
+
+out_unlock:
         raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);

-        deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
-                                        &wake_sleeper_q);
-
-        /*
-         * First unlock HB so the waiter does not spin on it once he got woken
-         * up. Second wake up the waiter before the priority is adjusted. If we
-         * deboost first (and lose our higher priority), then the task might get
-         * scheduled away before the wake up can take place.
-         */
-        deboost |= spin_unlock_no_deboost(&hb->lock);
-        wake_up_q(&wake_q);
-        wake_up_q_sleeper(&wake_sleeper_q);
-        if (deboost)
+        if (deboost) {
+                wake_up_q(&wake_q);
+                wake_up_q_sleeper(&wake_sleeper_q);
                 rt_mutex_adjust_prio(current);
+        }

-        return 0;
+        return ret;
 }

 /*
@@ -1829,7 +1914,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
                          * If that call succeeds then we have pi_state and an
                          * initial refcount on it.
                          */
-                        ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+                        ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
                 }

                 switch (ret) {
@@ -1912,7 +1997,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags,
                          * refcount on the pi_state and store the pointer in
                          * the futex_q object of the waiter.
                          */
-                        atomic_inc(&pi_state->refcount);
+                        get_pi_state(pi_state);
                         this->pi_state = pi_state;
                         ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
                                                         this->rt_waiter,
@@ -2022,20 +2107,7 @@ queue_unlock(struct futex_hash_bucket *hb)
         hb_waiters_dec(hb);
 }

-/**
- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
- * @q:  The futex_q to enqueue
- * @hb: The destination hash bucket
- *
- * The hb->lock must be held by the caller, and is released here. A call to
- * queue_me() is typically paired with exactly one call to unqueue_me(). The
- * exceptions involve the PI related operations, which may use unqueue_me_pi()
- * or nothing if the unqueue is done as part of the wake process and the unqueue
- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
- * an example).
- */
-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
-        __releases(&hb->lock)
+static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
         int prio;

@@ -2052,6 +2124,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
         plist_node_init(&q->list, prio);
         plist_add(&q->list, &hb->chain);
         q->task = current;
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q:  The futex_q to enqueue
+ * @hb: The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+ * an example).
+ */
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+        __releases(&hb->lock)
+{
+        __queue_me(q, hb);
         spin_unlock(&hb->lock);
 }

@@ -2138,10 +2228,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 {
         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
         struct futex_pi_state *pi_state = q->pi_state;
-        struct task_struct *oldowner = pi_state->owner;
         u32 uval, uninitialized_var(curval), newval;
+        struct task_struct *oldowner;
         int ret;

+        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+        oldowner = pi_state->owner;
         /* Owner died? */
         if (!pi_state->owner)
                 newtid |= FUTEX_OWNER_DIED;
@@ -2149,7 +2242,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
         /*
          * We are here either because we stole the rtmutex from the
          * previous highest priority waiter or we are the highest priority
-         * waiter but failed to get the rtmutex the first time.
+         * waiter but have failed to get the rtmutex the first time.
+         *
          * We have to replace the newowner TID in the user space variable.
          * This must be atomic as we have to preserve the owner died bit here.
          *
@@ -2157,17 +2251,16 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
          * because we can fault here. Imagine swapped out pages or a fork
          * that marked all the anonymous memory readonly for cow.
          *
-         * Modifying pi_state _before_ the user space value would
-         * leave the pi_state in an inconsistent state when we fault
-         * here, because we need to drop the hash bucket lock to
-         * handle the fault. This might be observed in the PID check
-         * in lookup_pi_state.
+         * Modifying pi_state _before_ the user space value would leave the
+         * pi_state in an inconsistent state when we fault here, because we
+         * need to drop the locks to handle the fault. This might be observed
+         * in the PID check in lookup_pi_state.
          */
 retry:
         if (get_futex_value_locked(&uval, uaddr))
                 goto handle_fault;

-        while (1) {
+        for (;;) {
                 newval = (uval & FUTEX_OWNER_DIED) | newtid;

                 if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2182,47 +2275,60 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
          * itself.
          */
         if (pi_state->owner != NULL) {
-                raw_spin_lock_irq(&pi_state->owner->pi_lock);
+                raw_spin_lock(&pi_state->owner->pi_lock);
                 WARN_ON(list_empty(&pi_state->list));
                 list_del_init(&pi_state->list);
-                raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+                raw_spin_unlock(&pi_state->owner->pi_lock);
         }

         pi_state->owner = newowner;

-        raw_spin_lock_irq(&newowner->pi_lock);
+        raw_spin_lock(&newowner->pi_lock);
         WARN_ON(!list_empty(&pi_state->list));
         list_add(&pi_state->list, &newowner->pi_state_list);
-        raw_spin_unlock_irq(&newowner->pi_lock);
+        raw_spin_unlock(&newowner->pi_lock);
+        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
         return 0;

         /*
-         * To handle the page fault we need to drop the hash bucket
-         * lock here. That gives the other task (either the highest priority
-         * waiter itself or the task which stole the rtmutex) the
-         * chance to try the fixup of the pi_state. So once we are
-         * back from handling the fault we need to check the pi_state
-         * after reacquiring the hash bucket lock and before trying to
-         * do another fixup. When the fixup has been done already we
-         * simply return.
+         * To handle the page fault we need to drop the locks here. That gives
+         * the other task (either the highest priority waiter itself or the
+         * task which stole the rtmutex) the chance to try the fixup of the
+         * pi_state. So once we are back from handling the fault we need to
+         * check the pi_state after reacquiring the locks and before trying to
+         * do another fixup. When the fixup has been done already we simply
+         * return.
+         *
+         * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+         * drop hb->lock since the caller owns the hb -> futex_q relation.
+         * Dropping the pi_mutex->wait_lock requires the state revalidate.
          */
 handle_fault:
+        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
         spin_unlock(q->lock_ptr);

         ret = fault_in_user_writeable(uaddr);

         spin_lock(q->lock_ptr);
+        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);

         /*
          * Check if someone else fixed it for us:
          */
-        if (pi_state->owner != oldowner)
-                return 0;
+        if (pi_state->owner != oldowner) {
+                ret = 0;
+                goto out_unlock;
+        }

         if (ret)
-                return ret;
+                goto out_unlock;

         goto retry;
+
+out_unlock:
+        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+        return ret;
 }

 static long futex_wait_restart(struct restart_block *restart);
@@ -2244,13 +2350,16 @@ static long futex_wait_restart(struct restart_block *restart);
  */
 static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
 {
-        struct task_struct *owner;
         int ret = 0;

         if (locked) {
                 /*
                  * Got the lock. We might not be the anticipated owner if we
                  * did a lock-steal - fix up the PI-state in that case:
+                 *
+                 * We can safely read pi_state->owner without holding wait_lock
+                 * because we now own the rt_mutex, only the owner will attempt
+                 * to change it.
                  */
                 if (q->pi_state->owner != current)
                         ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2258,43 +2367,15 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
         }

         /*
-         * Catch the rare case, where the lock was released when we were on the
-         * way back before we locked the hash bucket.
-         */
-        if (q->pi_state->owner == current) {
-                /*
-                 * Try to get the rt_mutex now. This might fail as some other
-                 * task acquired the rt_mutex after we removed ourself from the
-                 * rt_mutex waiters list.
-                 */
-                if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
-                        locked = 1;
-                        goto out;
-                }
-
-                /*
-                 * pi_state is incorrect, some other task did a lock steal and
-                 * we returned due to timeout or signal without taking the
-                 * rt_mutex. Too late.
-                 */
-                raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
-                owner = rt_mutex_owner(&q->pi_state->pi_mutex);
-                if (!owner)
-                        owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
-                raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
-                ret = fixup_pi_state_owner(uaddr, q, owner);
-                goto out;
-        }
-
-        /*
          * Paranoia check. If we did not take the lock, then we should not be
          * the owner of the rt_mutex.
          */
-        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+        if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
                 printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
                                 "pi-state %p\n", ret,
                                 q->pi_state->pi_mutex.owner,
                                 q->pi_state->owner);
+        }

 out:
         return ret ? ret : locked;
@@ -2518,6 +2599,8 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                          ktime_t *time, int trylock)
 {
         struct hrtimer_sleeper timeout, *to = NULL;
+        struct futex_pi_state *pi_state = NULL;
+        struct rt_mutex_waiter rt_waiter;
         struct futex_hash_bucket *hb;
         struct futex_q q = futex_q_init;
         int res, ret;
@@ -2570,25 +2653,77 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                 }
         }

+        WARN_ON(!q.pi_state);
+
         /*
          * Only actually queue now that the atomic ops are done:
          */
-        queue_me(&q, hb);
+        __queue_me(&q, hb);

-        WARN_ON(!q.pi_state);
-        /*
-         * Block on the PI mutex:
-         */
-        if (!trylock) {
-                ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
-        } else {
-                ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
+        if (trylock) {
+                ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
                 /* Fixup the trylock return value: */
                 ret = ret ? 0 : -EWOULDBLOCK;
+                goto no_block;
         }

+        rt_mutex_init_waiter(&rt_waiter, false);
+
+        /*
+         * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+         * hold it while doing rt_mutex_start_proxy(), because then it will
+         * include hb->lock in the blocking chain, even through we'll not in
+         * fact hold it while blocking. This will lead it to report -EDEADLK
+         * and BUG when futex_unlock_pi() interleaves with this.
+         *
+         * Therefore acquire wait_lock while holding hb->lock, but drop the
+         * latter before calling rt_mutex_start_proxy_lock(). This still fully
+         * serializes against futex_unlock_pi() as that does the exact same
+         * lock handoff sequence.
+         */
+        raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+        /*
+         * the migrate_disable() here disables migration in the in_atomic() fast
+         * path which is enabled again in the following spin_unlock(). We have
+         * one migrate_disable() pending in the slow-path which is reversed
+         * after the raw_spin_unlock_irq() where we leave the atomic context.
+         */
+        migrate_disable();
+
+        spin_unlock(q.lock_ptr);
+        ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+        raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+        migrate_enable();
+
+        if (ret) {
+                if (ret == 1)
+                        ret = 0;
+
+                spin_lock(q.lock_ptr);
+                goto no_block;
+        }
+
+
+        if (unlikely(to))
+                hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+
+        ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
+
         spin_lock(q.lock_ptr);
         /*
+         * If we failed to acquire the lock (signal/timeout), we must
+         * first acquire the hb->lock before removing the lock from the
+         * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+         * wait lists consistent.
+         *
+         * In particular; it is important that futex_unlock_pi() can not
+         * observe this inconsistency.
+         */
+        if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+                ret = 0;
+
+no_block:
+        /*
          * Fixup the pi_state owner and possibly acquire the lock if we
          * haven't already.
          */
@@ -2604,12 +2739,19 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
          * If fixup_owner() faulted and was unable to handle the fault, unlock
          * it and return the fault to userspace.
          */
-        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-                rt_mutex_unlock(&q.pi_state->pi_mutex);
+        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+                pi_state = q.pi_state;
+                get_pi_state(pi_state);
+        }

         /* Unqueue and drop the lock */
         unqueue_me_pi(&q);

+        if (pi_state) {
+                rt_mutex_futex_unlock(&pi_state->pi_mutex);
+                put_pi_state(pi_state);
+        }
+
         goto out_put_key;

 out_unlock_put_key:
@@ -2646,7 +2788,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
         u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
         union futex_key key = FUTEX_KEY_INIT;
         struct futex_hash_bucket *hb;
-        struct futex_q *match;
+        struct futex_q *top_waiter;
         int ret;

 retry:
@@ -2670,12 +2812,48 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
          * all and we at least want to know if user space fiddled
          * with the futex value instead of blindly unlocking.
          */
-        match = futex_top_waiter(hb, &key);
-        if (match) {
-                ret = wake_futex_pi(uaddr, uval, match, hb);
+        top_waiter = futex_top_waiter(hb, &key);
+        if (top_waiter) {
+                struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+                ret = -EINVAL;
+                if (!pi_state)
+                        goto out_unlock;
+
                 /*
-                 * In case of success wake_futex_pi dropped the hash
-                 * bucket lock.
+                 * If current does not own the pi_state then the futex is
+                 * inconsistent and user space fiddled with the futex value.
+                 */
+                if (pi_state->owner != current)
+                        goto out_unlock;
+
+                get_pi_state(pi_state);
+                /*
+                 * By taking wait_lock while still holding hb->lock, we ensure
+                 * there is no point where we hold neither; and therefore
+                 * wake_futex_pi() must observe a state consistent with what we
+                 * observed.
+                 */
+                raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+                /*
+                 * Magic trickery for now to make the RT migrate disable
+                 * logic happy. The following spin_unlock() happens with
+                 * interrupts disabled so the internal migrate_enable()
+                 * won't undo the migrate_disable() which was issued when
+                 * locking hb->lock.
+                 */
+                migrate_disable();
+                spin_unlock(&hb->lock);
+
+                /* Drops pi_state->pi_mutex.wait_lock */
+                ret = wake_futex_pi(uaddr, uval, pi_state);
+
+                migrate_enable();
+
+                put_pi_state(pi_state);
+
+                /*
+                 * Success, we're done! No tricky corner cases.
                  */
                 if (!ret)
                         goto out_putkey;
@@ -2690,7 +2868,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
                  * setting the FUTEX_WAITERS bit. Try again.
                  */
                 if (ret == -EAGAIN) {
-                        spin_unlock(&hb->lock);
                         put_futex_key(&key);
                         goto retry;
                 }
@@ -2698,7 +2875,7 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
                  * wake_futex_pi has detected invalid state. Tell user
                  * space.
                  */
-                goto out_unlock;
+                goto out_putkey;
         }

         /*
@@ -2708,8 +2885,10 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
          * preserve the WAITERS bit not the OWNER_DIED one. We are the
          * owner.
          */
-        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+        if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+                spin_unlock(&hb->lock);
                 goto pi_faulted;
+        }

         /*
          * If uval has changed, let user space handle it.
@@ -2723,7 +2902,6 @@ static int futex_unlock_pi(u32 __user *uaddr, unsigned int flags)
         return ret;

 pi_faulted:
-        spin_unlock(&hb->lock);
         put_futex_key(&key);

         ret = fault_in_user_writeable(uaddr);
@@ -2827,6 +3005,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                  u32 __user *uaddr2)
 {
         struct hrtimer_sleeper timeout, *to = NULL;
+        struct futex_pi_state *pi_state = NULL;
         struct rt_mutex_waiter rt_waiter;
         struct futex_hash_bucket *hb, *hb2;
         union futex_key key2 = FUTEX_KEY_INIT;
@@ -2944,8 +3123,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 spin_lock(&hb2->lock);
                 BUG_ON(&hb2->lock != q.lock_ptr);
                 ret = fixup_pi_state_owner(uaddr2, &q, current);
-                if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-                        rt_mutex_unlock(&q.pi_state->pi_mutex);
+                if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                        pi_state = q.pi_state;
+                        get_pi_state(pi_state);
+                }
                 /*
                  * Drop the reference to the pi state which
                  * the requeue_pi() code acquired for us.
@@ -2963,11 +3144,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                  */
                 WARN_ON(!q.pi_state);
                 pi_mutex = &q.pi_state->pi_mutex;
-                ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
-                debug_rt_mutex_free_waiter(&rt_waiter);
+                ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);

                 spin_lock(&hb2->lock);
                 BUG_ON(&hb2->lock != q.lock_ptr);
+                if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+                        ret = 0;
+
+                debug_rt_mutex_free_waiter(&rt_waiter);
                 /*
                  * Fixup the pi_state owner and possibly acquire the lock if we
                  * haven't already.
@@ -2985,13 +3169,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                  * the fault, unlock the rt_mutex and return the fault to
                  * userspace.
                  */
-                if (ret && rt_mutex_owner(pi_mutex) == current)
-                        rt_mutex_unlock(pi_mutex);
+                if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                        pi_state = q.pi_state;
+                        get_pi_state(pi_state);
+                }

                 /* Unqueue and drop the lock. */
                 unqueue_me_pi(&q);
         }

+        if (pi_state) {
+                rt_mutex_futex_unlock(&pi_state->pi_mutex);
+                put_pi_state(pi_state);
+        }
+
         if (ret == -EINTR) {
                 /*
                  * We've already been requeued, but cannot restart by calling
diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c
--- a/kernel/locking/rtmutex-debug.c
+++ b/kernel/locking/rtmutex-debug.c
@@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name)
         lock->name = name;
 }

-void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
-{
-}
-
-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
-{
-}
-
diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h
--- a/kernel/locking/rtmutex-debug.h
+++ b/kernel/locking/rtmutex-debug.h
@@ -9,9 +9,6 @@
  * This file contains macros used solely by rtmutex.c. Debug version.
  */

-extern void
-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
 extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
 extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -978,8 +978,6 @@ static int __try_to_take_rt_mutex(struct rt_mutex *lock,
          */
         rt_mutex_set_owner(lock, task);

-        rt_mutex_deadlock_account_lock(lock, task);
-
         return 1;
 }

@@ -998,19 +996,18 @@ static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
                 migrate_disable();

         if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
-                rt_mutex_deadlock_account_lock(lock, current);
+                return;
         else
                 slowfn(lock, do_mig_dis);
 }

-static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
-                                          int (*slowfn)(struct rt_mutex *lock))
+static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
+                                           void (*slowfn)(struct rt_mutex *lock))
 {
-        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-                rt_mutex_deadlock_account_unlock(current);
-                return 0;
-        }
-        return slowfn(lock);
+        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+                return;
+        else
+                slowfn(lock);
 }
 #ifdef CONFIG_SMP
 /*
@@ -1151,7 +1148,7 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
 /*
  * Slow path to release a rt_mutex spin_lock style
  */
-static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
+static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
 {
         unsigned long flags;
         WAKE_Q(wake_q);
@@ -1161,12 +1158,10 @@ static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)

         debug_rt_mutex_unlock(lock);

-        rt_mutex_deadlock_account_unlock(current);
-
         if (!rt_mutex_has_waiters(lock)) {
                 lock->owner = NULL;
                 raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-                return 0;
+                return;
         }

         mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
@@ -1177,33 +1172,6 @@ static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)

         /* Undo pi boosting.when necessary */
         rt_mutex_adjust_prio(current);
-        return 0;
-}
-
-static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
-{
-        unsigned long flags;
-        WAKE_Q(wake_q);
-        WAKE_Q(wake_sleeper_q);
-
-        raw_spin_lock_irqsave(&lock->wait_lock, flags);
-
-        debug_rt_mutex_unlock(lock);
-
-        rt_mutex_deadlock_account_unlock(current);
-
-        if (!rt_mutex_has_waiters(lock)) {
-                lock->owner = NULL;
-                raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-                return 0;
-        }
-
-        mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
-
-        raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-        wake_up_q(&wake_q);
-        wake_up_q_sleeper(&wake_sleeper_q);
-        return 1;
 }

 void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
@@ -1258,17 +1226,6 @@ void __lockfunc rt_spin_unlock(spinlock_t *lock)
 }
 EXPORT_SYMBOL(rt_spin_unlock);

-int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
-{
-        int ret;
-
-        /* NOTE: we always pass in '1' for nested, for simplicity */
-        spin_release(&lock->dep_map, 1, _RET_IP_);
-        ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
-        migrate_enable();
-        return ret;
-}
-
 void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
 {
         rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
@@ -1644,6 +1601,15 @@ void rt_mutex_adjust_pi(struct task_struct *task)
                                    next_lock, NULL, task);
 }

+void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
+{
+        debug_rt_mutex_init_waiter(waiter);
+        RB_CLEAR_NODE(&waiter->pi_tree_entry);
+        RB_CLEAR_NODE(&waiter->tree_entry);
+        waiter->task = NULL;
+        waiter->savestate = savestate;
+}
+
 /**
  * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
  * @lock:                the rt_mutex to take
@@ -1926,8 +1892,6 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,

         debug_rt_mutex_unlock(lock);

-        rt_mutex_deadlock_account_unlock(current);
-
         /*
          * We must be careful here if the fast path is enabled. If we
          * have no waiters queued we cannot set owner to NULL here
@@ -1995,12 +1959,10 @@ rt_mutex_fastlock(struct rt_mutex *lock, int state,
                                 enum rtmutex_chainwalk chwalk,
                                 struct ww_acquire_ctx *ww_ctx))
 {
-        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-                rt_mutex_deadlock_account_lock(lock, current);
+        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                 return 0;
-        } else
-                return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
-                              ww_ctx);
+
+        return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
 }

 static inline int
@@ -2014,21 +1976,19 @@ rt_mutex_timed_fastlock(struct rt_mutex *lock, int state,
                                       struct ww_acquire_ctx *ww_ctx))
 {
         if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
-            likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-                rt_mutex_deadlock_account_lock(lock, current);
+            likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                 return 0;
-        } else
-                return slowfn(lock, state, timeout, chwalk, ww_ctx);
+
+        return slowfn(lock, state, timeout, chwalk, ww_ctx);
 }

 static inline int
 rt_mutex_fasttrylock(struct rt_mutex *lock,
                      int (*slowfn)(struct rt_mutex *lock))
 {
-        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
-                rt_mutex_deadlock_account_lock(lock, current);
+        if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
                 return 1;
-        }
+
         return slowfn(lock);
 }

@@ -2040,20 +2000,19 @@ rt_mutex_fastunlock(struct rt_mutex *lock,
 {
         WAKE_Q(wake_q);
         WAKE_Q(wake_sleeper_q);
+        bool deboost;

-        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-                rt_mutex_deadlock_account_unlock(current);
+        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+                return;

-        } else {
-                bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
+        deboost = slowfn(lock, &wake_q, &wake_sleeper_q);

-                wake_up_q(&wake_q);
-                wake_up_q_sleeper(&wake_sleeper_q);
+        wake_up_q(&wake_q);
+        wake_up_q_sleeper(&wake_sleeper_q);

-                /* Undo pi boosting if necessary: */
-                if (deboost)
-                        rt_mutex_adjust_prio(current);
-        }
+        /* Undo pi boosting if necessary: */
+        if (deboost)
+                rt_mutex_adjust_prio(current);
 }

 /**
@@ -2087,16 +2046,11 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);

 /*
- * Futex variant with full deadlock detection.
+ * Futex variant, must not use fastpath.
  */
-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
-                              struct hrtimer_sleeper *timeout)
+int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
 {
-        might_sleep();
-
-        return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
-                                       RT_MUTEX_FULL_CHAINWALK, NULL,
-                                       rt_mutex_slowlock);
+        return rt_mutex_slowtrylock(lock);
 }

 /**
@@ -2179,21 +2133,41 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);

 /**
- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
- * @lock: the rt_mutex to be unlocked
- *
- * Returns: true/false indicating whether priority adjustment is
- * required or not.
+ * Futex variant, that since futex variants do not use the fast-path, can be
+ * simple and will not need to retry.
  */
-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
-                                   struct wake_q_head *wqh,
-                                   struct wake_q_head *wq_sleeper)
+bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+                                     struct wake_q_head *wake_q,
+                                     struct wake_q_head *wq_sleeper)
 {
-        if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
-                rt_mutex_deadlock_account_unlock(current);
-                return false;
+        lockdep_assert_held(&lock->wait_lock);
+
+        debug_rt_mutex_unlock(lock);
+
+        if (!rt_mutex_has_waiters(lock)) {
+                lock->owner = NULL;
+                return false; /* done */
+        }
+
+        mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
+        return true; /* deboost and wakeups */
+}
+
+void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+{
+        WAKE_Q(wake_q);
+        WAKE_Q(wake_sleeper_q);
+        bool deboost;
+
+        raw_spin_lock_irq(&lock->wait_lock);
+        deboost = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
+        raw_spin_unlock_irq(&lock->wait_lock);
+
+        if (deboost) {
+                wake_up_q(&wake_q);
+                wake_up_q_sleeper(&wake_sleeper_q);
+                rt_mutex_adjust_prio(current);
         }
-        return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
 }

 /**
@@ -2249,7 +2223,6 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
         rt_mutex_init(lock);
         debug_rt_mutex_proxy_lock(lock, proxy_owner);
         rt_mutex_set_owner(lock, proxy_owner);
-        rt_mutex_deadlock_account_lock(lock, proxy_owner);
 }

 /**
@@ -2265,34 +2238,16 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
 {
         debug_rt_mutex_proxy_unlock(lock);
         rt_mutex_set_owner(lock, NULL);
-        rt_mutex_deadlock_account_unlock(proxy_owner);
 }

-/**
- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
- * @lock:               the rt_mutex to take
- * @waiter:             the pre-initialized rt_mutex_waiter
- * @task:               the task to prepare
- *
- * Returns:
- *  0 - task blocked on lock
- *  1 - acquired the lock for task, caller should wake it up
- * <0 - error
- *
- * Special API call for FUTEX_REQUEUE_PI support.
- */
-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                               struct rt_mutex_waiter *waiter,
                               struct task_struct *task)
 {
         int ret;

-        raw_spin_lock_irq(&lock->wait_lock);
-
-        if (try_to_take_rt_mutex(lock, task, NULL)) {
-                raw_spin_unlock_irq(&lock->wait_lock);
+        if (try_to_take_rt_mutex(lock, task, NULL))
                 return 1;
-        }

 #ifdef CONFIG_PREEMPT_RT_FULL
         /*
@@ -2340,14 +2295,38 @@ int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
         if (ret && rt_mutex_has_waiters(lock))
                 remove_waiter(lock, waiter);

-        raw_spin_unlock_irq(&lock->wait_lock);
-
         debug_rt_mutex_print_deadlock(waiter);

         return ret;
 }

 /**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:               the rt_mutex to take
+ * @waiter:             the pre-initialized rt_mutex_waiter
+ * @task:               the task to prepare
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                              struct rt_mutex_waiter *waiter,
+                              struct task_struct *task)
+{
+        int ret;
+
+        raw_spin_lock_irq(&lock->wait_lock);
+        ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
+        raw_spin_unlock_irq(&lock->wait_lock);
+
+        return ret;
+}
+
+/**
  * rt_mutex_next_owner - return the next owner of the lock
  *
  * @lock: the rt lock query
@@ -2368,21 +2347,23 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
 }

 /**
- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
  * @lock:               the rt_mutex we were woken on
  * @to:                 the timeout, null if none. hrtimer should already have
  *                      been started.
  * @waiter:             the pre-initialized rt_mutex_waiter
  *
- * Complete the lock acquisition started our behalf by another thread.
+ * Wait for the the lock acquisition started on our behalf by
+ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
+ * rt_mutex_cleanup_proxy_lock().
  *
  * Returns:
  *  0 - success
  * <0 - error, one of -EINTR, -ETIMEDOUT
  *
- * Special API call for PI-futex requeue support
+ * Special API call for PI-futex support
  */
-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
                                struct hrtimer_sleeper *to,
                                struct rt_mutex_waiter *waiter)
 {
@@ -2395,8 +2376,45 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
         /* sleep on the mutex */
         ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);

-        if (unlikely(ret))
+        raw_spin_unlock_irq(&lock->wait_lock);
+
+        return ret;
+}
+
+/**
+ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
+ * @lock:               the rt_mutex we were woken on
+ * @waiter:             the pre-initialized rt_mutex_waiter
+ *
+ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
+ *
+ * Unless we acquired the lock; we're still enqueued on the wait-list and can
+ * in fact still be granted ownership until we're removed. Therefore we can
+ * find we are in fact the owner and must disregard the
+ * rt_mutex_wait_proxy_lock() failure.
+ *
+ * Returns:
+ *  true  - did the cleanup, we done.
+ *  false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
+ *          caller should disregards its return value.
+ *
+ * Special API call for PI-futex support
+ */
+bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+                                 struct rt_mutex_waiter *waiter)
+{
+        bool cleanup = false;
+
+        raw_spin_lock_irq(&lock->wait_lock);
+        /*
+         * Unless we're the owner; we're still enqueued on the wait_list.
+         * So check if we became owner, if not, take us off the wait_list.
+         */
+        if (rt_mutex_owner(lock) != current) {
                 remove_waiter(lock, waiter);
+                fixup_rt_mutex_waiters(lock);
+                cleanup = true;
+        }

         /*
          * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
@@ -2406,7 +2424,7 @@ int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,

         raw_spin_unlock_irq(&lock->wait_lock);

-        return ret;
+        return cleanup;
 }

 static inline int
diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h
--- a/kernel/locking/rtmutex.h
+++ b/kernel/locking/rtmutex.h
@@ -11,8 +11,6 @@
  */

 #define rt_mutex_deadlock_check(l)                      (0)
-#define rt_mutex_deadlock_account_lock(m, t)            do { } while (0)
-#define rt_mutex_deadlock_account_unlock(l)             do { } while (0)
 #define debug_rt_mutex_init_waiter(w)                   do { } while (0)
 #define debug_rt_mutex_free_waiter(w)                   do { } while (0)
 #define debug_rt_mutex_lock(l)                          do { } while (0)
diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h
--- a/kernel/locking/rtmutex_common.h
+++ b/kernel/locking/rtmutex_common.h
@@ -107,16 +107,26 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                        struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                   struct task_struct *proxy_owner);
+extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate);
+extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                       struct rt_mutex_waiter *waiter,
+                                       struct task_struct *task);
 extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
                                      struct rt_mutex_waiter *waiter,
                                      struct task_struct *task);
-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
-                                      struct hrtimer_sleeper *to,
-                                      struct rt_mutex_waiter *waiter);
-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
-                                  struct wake_q_head *wqh,
-                                  struct wake_q_head *wq_sleeper);
+extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+                                    struct hrtimer_sleeper *to,
+                                    struct rt_mutex_waiter *waiter);
+extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+                                        struct rt_mutex_waiter *waiter);
+
+extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+                                    struct wake_q_head *wqh,
+                                    struct wake_q_head *wq_sleeper);
+
 extern void rt_mutex_adjust_prio(struct task_struct *task);

 #ifdef CONFIG_DEBUG_RT_MUTEXES
@@ -125,14 +135,4 @@ extern void rt_mutex_adjust_prio(struct task_struct *task);
 # include "rtmutex.h"
 #endif

-static inline void
-rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
-{
-        debug_rt_mutex_init_waiter(waiter);
-        waiter->task = NULL;
-        waiter->savestate = savestate;
-        RB_CLEAR_NODE(&waiter->pi_tree_entry);
-        RB_CLEAR_NODE(&waiter->tree_entry);
-}
-
 #endif
diff --git a/kernel/module.c b/kernel/module.c
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -677,8 +677,12 @@ bool __is_module_percpu_address(unsigned long addr, unsigned long *can_addr)
                         void *va = (void *)addr;

                         if (va >= start && va < start + mod->percpu_size) {
-                                if (can_addr)
+                                if (can_addr) {
                                         *can_addr = (unsigned long) (va - start);
+                                        *can_addr += (unsigned long)
+                                                per_cpu_ptr(mod->percpu,
+                                                            get_boot_cpu_id());
+                                }
                                 preempt_enable();
                                 return true;
                         }
diff --git a/localversion-rt b/localversion-rt
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt13
+-rt14
diff --git a/mm/percpu.c b/mm/percpu.c
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -1295,8 +1295,11 @@ bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr)
                 void *va = (void *)addr;

                 if (va >= start && va < start + static_size) {
-                        if (can_addr)
+                        if (can_addr) {
                                 *can_addr = (unsigned long) (va - start);
+                                *can_addr += (unsigned long)
+                                        per_cpu_ptr(base, get_boot_cpu_id());
+                        }
                         return true;
                 }
         }
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
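
The mm/percpu.c and kernel/module.c hunks above are the lockdep fix for
statically initialized per-CPU locks mentioned in the changelog: lockdep
wants one canonical address per lock object, so the address checks now add
the boot CPU's base so that every CPU's copy of the same per-CPU lock maps
to a single address. A minimal user-space sketch of that idea (illustration
only, not kernel code; the static array and names stand in for the per-CPU
chunks and helpers):

#include <stdint.h>
#include <stdio.h>

#define NR_CPUS   4
#define AREA_SIZE 64

static char pcpu_area[NR_CPUS][AREA_SIZE];      /* one copy per CPU */

/*
 * Map an address inside any CPU's copy of the per-CPU area onto the
 * boot CPU's copy, so all copies of one variable share one address.
 */
static uintptr_t canonical_addr(uintptr_t addr, int boot_cpu)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                uintptr_t start = (uintptr_t)pcpu_area[cpu];

                if (addr >= start && addr < start + AREA_SIZE)
                        return (addr - start) + (uintptr_t)pcpu_area[boot_cpu];
        }
        return 0;       /* not a per-CPU address */
}

int main(void)
{
        uintptr_t a = canonical_addr((uintptr_t)&pcpu_area[3][10], 0);
        uintptr_t b = canonical_addr((uintptr_t)&pcpu_area[1][10], 0);

        printf("%d\n", a == b); /* 1: both copies map to one address */
        return 0;
}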
-rw-r--r--  patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch | 117
-rw-r--r--  patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch | 38
-rw-r--r--  patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch | 184
-rw-r--r--  patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch | 220
-rw-r--r--  patches/0005-futex-Change-locking-rules.patch | 370
-rw-r--r--  patches/0006-futex-Cleanup-refcounting.patch | 75
-rw-r--r--  patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch | 139
-rw-r--r--  patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch | 357
-rw-r--r--  patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch | 79
-rw-r--r--  patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch | 158
-rw-r--r--  patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch | 266
-rw-r--r--  patches/0012-futex-Futex_unlock_pi-determinism.patch | 80
-rw-r--r--  patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch | 203
-rw-r--r--  patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch | 2
-rw-r--r--  patches/futex-requeue-pi-fix.patch | 4
-rw-r--r--  patches/futex-workaround-migrate_disable-enable-in-different.patch | 58
-rw-r--r--  patches/introduce_migrate_disable_cpu_light.patch | 2
-rw-r--r--  patches/kernel-futex-don-t-deboost-too-early.patch | 161
-rw-r--r--  patches/localversion.patch | 2
-rw-r--r--  patches/lockdep-Fix-per-cpu-static-objects.patch | 124
-rw-r--r--  patches/rt-add-rt-locks.patch | 206
-rw-r--r--  patches/rt-locking-Reenable-migration-accross-schedule.patch | 12
-rw-r--r--  patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch | 4
-rw-r--r--  patches/rtmutex-add-a-first-shot-of-ww_mutex.patch | 64
-rw-r--r--  patches/rtmutex-futex-prepare-rt.patch | 38
-rw-r--r--  patches/rtmutex-lock-killable.patch | 2
-rw-r--r--  patches/rtmutex-trylock-is-okay-on-RT.patch | 2
-rw-r--r--  patches/series | 17
28 files changed, 2660 insertions, 324 deletions
diff --git a/patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch b/patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch
new file mode 100644
index 000000000000..128cf8001839
--- /dev/null
+++ b/patches/0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch
@@ -0,0 +1,117 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:48 +0100
+Subject: [PATCH] futex: Cleanup variable names for futex_top_waiter()
+
+Upstream commit 499f5aca2cdd5e958b27e2655e7e7f82524f46b1
+
+futex_top_waiter() returns the top-waiter on the pi_mutex. Assigning
+this to a variable 'match' totally obscures the code.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.554710645@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 30 +++++++++++++++---------------
+ 1 file changed, 15 insertions(+), 15 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1120,14 +1120,14 @@ static int attach_to_pi_owner(u32 uval,
+ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+ {
+- struct futex_q *match = futex_top_waiter(hb, key);
++ struct futex_q *top_waiter = futex_top_waiter(hb, key);
+
+ /*
+ * If there is a waiter on that futex, validate it and
+ * attach to the pi_state when the validation succeeds.
+ */
+- if (match)
+- return attach_to_pi_state(uval, match->pi_state, ps);
++ if (top_waiter)
++ return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+@@ -1174,7 +1174,7 @@ static int futex_lock_pi_atomic(u32 __us
+ struct task_struct *task, int set_waiters)
+ {
+ u32 uval, newval, vpid = task_pid_vnr(task);
+- struct futex_q *match;
++ struct futex_q *top_waiter;
+ int ret;
+
+ /*
+@@ -1200,9 +1200,9 @@ static int futex_lock_pi_atomic(u32 __us
+ * Lookup existing state first. If it exists, try to attach to
+ * its pi_state.
+ */
+- match = futex_top_waiter(hb, key);
+- if (match)
+- return attach_to_pi_state(uval, match->pi_state, ps);
++ top_waiter = futex_top_waiter(hb, key);
++ if (top_waiter)
++ return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+@@ -1292,11 +1292,11 @@ static void mark_wake_futex(struct wake_
+ q->lock_ptr = NULL;
+ }
+
+-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
+ struct futex_hash_bucket *hb)
+ {
+ struct task_struct *new_owner;
+- struct futex_pi_state *pi_state = this->pi_state;
++ struct futex_pi_state *pi_state = top_waiter->pi_state;
+ u32 uninitialized_var(curval), newval;
+ WAKE_Q(wake_q);
+ bool deboost;
+@@ -1317,11 +1317,11 @@ static int wake_futex_pi(u32 __user *uad
+
+ /*
+ * It is possible that the next waiter (the one that brought
+- * this owner to the kernel) timed out and is no longer
++ * top_waiter owner to the kernel) timed out and is no longer
+ * waiting on the lock.
+ */
+ if (!new_owner)
+- new_owner = this->task;
++ new_owner = top_waiter->task;
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always
+@@ -2631,7 +2631,7 @@ static int futex_unlock_pi(u32 __user *u
+ u32 uninitialized_var(curval), uval, vpid = task_pid_vnr(current);
+ union futex_key key = FUTEX_KEY_INIT;
+ struct futex_hash_bucket *hb;
+- struct futex_q *match;
++ struct futex_q *top_waiter;
+ int ret;
+
+ retry:
+@@ -2655,9 +2655,9 @@ static int futex_unlock_pi(u32 __user *u
+ * all and we at least want to know if user space fiddled
+ * with the futex value instead of blindly unlocking.
+ */
+- match = futex_top_waiter(hb, &key);
+- if (match) {
+- ret = wake_futex_pi(uaddr, uval, match, hb);
++ top_waiter = futex_top_waiter(hb, &key);
++ if (top_waiter) {
++ ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
+ /*
+ * In case of success wake_futex_pi dropped the hash
+ * bucket lock.
diff --git a/patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch b/patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch
new file mode 100644
index 000000000000..29c184a5184d
--- /dev/null
+++ b/patches/0002-futex-Use-smp_store_release-in-mark_wake_futex.patch
@@ -0,0 +1,38 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:49 +0100
+Subject: [PATCH] futex: Use smp_store_release() in mark_wake_futex()
+
+Upstream commit 1b367ece0d7e696cab1c8501bab282cc6a538b3f
+
+Since the futex_q can disappear the instruction after assigning NULL,
+this really should be a RELEASE barrier. That stops loads from hitting
+dead memory too.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.604296452@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 3 +--
+ 1 file changed, 1 insertion(+), 2 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1288,8 +1288,7 @@ static void mark_wake_futex(struct wake_
+ * memory barrier is required here to prevent the following
+ * store to lock_ptr from getting ahead of the plist_del.
+ */
+- smp_wmb();
+- q->lock_ptr = NULL;
++ smp_store_release(&q->lock_ptr, NULL);
+ }
+
+ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
diff --git a/patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch b/patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch
new file mode 100644
index 000000000000..630982fb1310
--- /dev/null
+++ b/patches/0003-futex-Remove-rt_mutex_deadlock_account_.patch
@@ -0,0 +1,184 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:50 +0100
+Subject: [PATCH] futex: Remove rt_mutex_deadlock_account_*()
+
+Upstream commit fffa954fb528963c2fb7b0c0084eb77e2be7ab52
+
+These are unused and clutter up the code.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.652692478@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/locking/rtmutex-debug.c | 9 -------
+ kernel/locking/rtmutex-debug.h | 3 --
+ kernel/locking/rtmutex.c | 47 +++++++++++++++--------------------------
+ kernel/locking/rtmutex.h | 2 -
+ 4 files changed, 18 insertions(+), 43 deletions(-)
+
+--- a/kernel/locking/rtmutex-debug.c
++++ b/kernel/locking/rtmutex-debug.c
+@@ -173,12 +173,3 @@ void debug_rt_mutex_init(struct rt_mutex
+ lock->name = name;
+ }
+
+-void
+-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task)
+-{
+-}
+-
+-void rt_mutex_deadlock_account_unlock(struct task_struct *task)
+-{
+-}
+-
+--- a/kernel/locking/rtmutex-debug.h
++++ b/kernel/locking/rtmutex-debug.h
+@@ -9,9 +9,6 @@
+ * This file contains macros used solely by rtmutex.c. Debug version.
+ */
+
+-extern void
+-rt_mutex_deadlock_account_lock(struct rt_mutex *lock, struct task_struct *task);
+-extern void rt_mutex_deadlock_account_unlock(struct task_struct *task);
+ extern void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter);
+ extern void debug_rt_mutex_init(struct rt_mutex *lock, const char *name);
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -936,8 +936,6 @@ static int try_to_take_rt_mutex(struct r
+ */
+ rt_mutex_set_owner(lock, task);
+
+- rt_mutex_deadlock_account_lock(lock, task);
+-
+ return 1;
+ }
+
+@@ -1340,8 +1338,6 @@ static bool __sched rt_mutex_slowunlock(
+
+ debug_rt_mutex_unlock(lock);
+
+- rt_mutex_deadlock_account_unlock(current);
+-
+ /*
+ * We must be careful here if the fast path is enabled. If we
+ * have no waiters queued we cannot set owner to NULL here
+@@ -1407,11 +1403,10 @@ rt_mutex_fastlock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout,
+ enum rtmutex_chainwalk chwalk))
+ {
+- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
+- rt_mutex_deadlock_account_lock(lock, current);
++ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+- } else
+- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
++
++ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
+ }
+
+ static inline int
+@@ -1423,21 +1418,19 @@ rt_mutex_timed_fastlock(struct rt_mutex
+ enum rtmutex_chainwalk chwalk))
+ {
+ if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
+- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
+- rt_mutex_deadlock_account_lock(lock, current);
++ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 0;
+- } else
+- return slowfn(lock, state, timeout, chwalk);
++
++ return slowfn(lock, state, timeout, chwalk);
+ }
+
+ static inline int
+ rt_mutex_fasttrylock(struct rt_mutex *lock,
+ int (*slowfn)(struct rt_mutex *lock))
+ {
+- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
+- rt_mutex_deadlock_account_lock(lock, current);
++ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
+ return 1;
+- }
++
+ return slowfn(lock);
+ }
+
+@@ -1447,19 +1440,18 @@ rt_mutex_fastunlock(struct rt_mutex *loc
+ struct wake_q_head *wqh))
+ {
+ WAKE_Q(wake_q);
++ bool deboost;
+
+- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
+- rt_mutex_deadlock_account_unlock(current);
++ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
++ return;
+
+- } else {
+- bool deboost = slowfn(lock, &wake_q);
++ deboost = slowfn(lock, &wake_q);
+
+- wake_up_q(&wake_q);
++ wake_up_q(&wake_q);
+
+- /* Undo pi boosting if necessary: */
+- if (deboost)
+- rt_mutex_adjust_prio(current);
+- }
++ /* Undo pi boosting if necessary: */
++ if (deboost)
++ rt_mutex_adjust_prio(current);
+ }
+
+ /**
+@@ -1570,10 +1562,9 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+ bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+ struct wake_q_head *wqh)
+ {
+- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
+- rt_mutex_deadlock_account_unlock(current);
++ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return false;
+- }
++
+ return rt_mutex_slowunlock(lock, wqh);
+ }
+
+@@ -1631,7 +1622,6 @@ void rt_mutex_init_proxy_locked(struct r
+ __rt_mutex_init(lock, NULL);
+ debug_rt_mutex_proxy_lock(lock, proxy_owner);
+ rt_mutex_set_owner(lock, proxy_owner);
+- rt_mutex_deadlock_account_lock(lock, proxy_owner);
+ }
+
+ /**
+@@ -1647,7 +1637,6 @@ void rt_mutex_proxy_unlock(struct rt_mut
+ {
+ debug_rt_mutex_proxy_unlock(lock);
+ rt_mutex_set_owner(lock, NULL);
+- rt_mutex_deadlock_account_unlock(proxy_owner);
+ }
+
+ /**
+--- a/kernel/locking/rtmutex.h
++++ b/kernel/locking/rtmutex.h
+@@ -11,8 +11,6 @@
+ */
+
+ #define rt_mutex_deadlock_check(l) (0)
+-#define rt_mutex_deadlock_account_lock(m, t) do { } while (0)
+-#define rt_mutex_deadlock_account_unlock(l) do { } while (0)
+ #define debug_rt_mutex_init_waiter(w) do { } while (0)
+ #define debug_rt_mutex_free_waiter(w) do { } while (0)
+ #define debug_rt_mutex_lock(l) do { } while (0)
diff --git a/patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch b/patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch
new file mode 100644
index 000000000000..5f39524b167b
--- /dev/null
+++ b/patches/0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch
@@ -0,0 +1,220 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:51 +0100
+Subject: [PATCH] futex,rt_mutex: Provide futex specific rt_mutex API
+
+Upstream commit 5293c2efda37775346885c7e924d4ef7018ea60b
+
+Part of what makes futex_unlock_pi() intricate is that
+rt_mutex_futex_unlock() -> rt_mutex_slowunlock() can drop
+rt_mutex::wait_lock.
+
+This means it cannot rely on the atomicity of wait_lock, which would be
+preferred in order to not rely on hb->lock so much.
+
+The reason rt_mutex_slowunlock() needs to drop wait_lock is because it can
+race with the rt_mutex fastpath, however futexes have their own fast path.
+
+Since futexes already have a bunch of separate rt_mutex accessors, complete
+that set and implement a rt_mutex variant without fastpath for them.
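+
+Sketched in userspace terms (illustrative only; pthread/C11 stand-ins
+with hypothetical names, not the kernel API), the difference is that the
+generic unlock may bypass wait_lock entirely, while the futex variant
+always runs under it:
+
+  #include <stdatomic.h>
+  #include <pthread.h>
+
+  struct demo_lock {
+          _Atomic long owner;        /* thread id, 0 == unowned */
+          pthread_mutex_t wait_lock; /* ~ rt_mutex::wait_lock   */
+  };
+
+  static void demo_generic_unlock(struct demo_lock *l, long me)
+  {
+          long expected = me;
+
+          /* fastpath: uncontended, wait_lock never taken */
+          if (atomic_compare_exchange_strong(&l->owner, &expected, 0))
+                  return;
+          pthread_mutex_lock(&l->wait_lock);
+          /* slowpath: hand the lock to the next waiter ... */
+          pthread_mutex_unlock(&l->wait_lock);
+  }
+
+  static void demo_futex_unlock(struct demo_lock *l)
+  {
+          /* no fastpath: the whole owner transition happens under
+           * wait_lock, so callers may rely on its atomicity */
+          pthread_mutex_lock(&l->wait_lock);
+          atomic_store(&l->owner, 0);
+          /* wake the next waiter here ... */
+          pthread_mutex_unlock(&l->wait_lock);
+  }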
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.702962446@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 30 ++++++++++-----------
+ kernel/locking/rtmutex.c | 55 +++++++++++++++++++++++++++++-----------
+ kernel/locking/rtmutex_common.h | 9 +++++-
+ 3 files changed, 62 insertions(+), 32 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -914,7 +914,7 @@ void exit_pi_state_list(struct task_stru
+ pi_state->owner = NULL;
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+- rt_mutex_unlock(&pi_state->pi_mutex);
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
+
+ spin_unlock(&hb->lock);
+
+@@ -1362,20 +1362,18 @@ static int wake_futex_pi(u32 __user *uad
+ pi_state->owner = new_owner;
+ raw_spin_unlock(&new_owner->pi_lock);
+
+- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+-
+- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+-
+ /*
+- * First unlock HB so the waiter does not spin on it once he got woken
+- * up. Second wake up the waiter before the priority is adjusted. If we
+- * deboost first (and lose our higher priority), then the task might get
+- * scheduled away before the wake up can take place.
++ * We've updated the uservalue, this unlock cannot fail.
+ */
++ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
++
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+- wake_up_q(&wake_q);
+- if (deboost)
++
++ if (deboost) {
++ wake_up_q(&wake_q);
+ rt_mutex_adjust_prio(current);
++ }
+
+ return 0;
+ }
+@@ -2251,7 +2249,7 @@ static int fixup_owner(u32 __user *uaddr
+ * task acquired the rt_mutex after we removed ourself from the
+ * rt_mutex waiters list.
+ */
+- if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
++ if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
+ locked = 1;
+ goto out;
+ }
+@@ -2566,7 +2564,7 @@ static int futex_lock_pi(u32 __user *uad
+ if (!trylock) {
+ ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+ } else {
+- ret = rt_mutex_trylock(&q.pi_state->pi_mutex);
++ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
+ }
+@@ -2589,7 +2587,7 @@ static int futex_lock_pi(u32 __user *uad
+ * it and return the fault to userspace.
+ */
+ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+- rt_mutex_unlock(&q.pi_state->pi_mutex);
++ rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+@@ -2896,7 +2894,7 @@ static int futex_wait_requeue_pi(u32 __u
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+- rt_mutex_unlock(&q.pi_state->pi_mutex);
++ rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+@@ -2936,7 +2934,7 @@ static int futex_wait_requeue_pi(u32 __u
+ * userspace.
+ */
+ if (ret && rt_mutex_owner(pi_mutex) == current)
+- rt_mutex_unlock(pi_mutex);
++ rt_mutex_futex_unlock(pi_mutex);
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1486,15 +1486,23 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock_interrup
+
+ /*
+ * Futex variant with full deadlock detection.
++ * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+ */
+-int rt_mutex_timed_futex_lock(struct rt_mutex *lock,
++int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *timeout)
+ {
+ might_sleep();
+
+- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
+- RT_MUTEX_FULL_CHAINWALK,
+- rt_mutex_slowlock);
++ return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
++ timeout, RT_MUTEX_FULL_CHAINWALK);
++}
++
++/*
++ * Futex variant, must not use fastpath.
++ */
++int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
++{
++ return rt_mutex_slowtrylock(lock);
+ }
+
+ /**
+@@ -1553,19 +1561,38 @@ void __sched rt_mutex_unlock(struct rt_m
+ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+
+ /**
+- * rt_mutex_futex_unlock - Futex variant of rt_mutex_unlock
+- * @lock: the rt_mutex to be unlocked
+- *
+- * Returns: true/false indicating whether priority adjustment is
+- * required or not.
++ * Futex variant: since futex variants do not use the fast-path, this
++ * can be simple and will not need to retry.
+ */
+-bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
+- struct wake_q_head *wqh)
++bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
++ struct wake_q_head *wake_q)
++{
++ lockdep_assert_held(&lock->wait_lock);
++
++ debug_rt_mutex_unlock(lock);
++
++ if (!rt_mutex_has_waiters(lock)) {
++ lock->owner = NULL;
++ return false; /* done */
++ }
++
++ mark_wakeup_next_waiter(wake_q, lock);
++ return true; /* deboost and wakeups */
++}
++
++void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+ {
+- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+- return false;
++ WAKE_Q(wake_q);
++ bool deboost;
+
+- return rt_mutex_slowunlock(lock, wqh);
++ raw_spin_lock_irq(&lock->wait_lock);
++ deboost = __rt_mutex_futex_unlock(lock, &wake_q);
++ raw_spin_unlock_irq(&lock->wait_lock);
++
++ if (deboost) {
++ wake_up_q(&wake_q);
++ rt_mutex_adjust_prio(current);
++ }
+ }
+
+ /**
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -109,9 +109,14 @@ extern int rt_mutex_start_proxy_lock(str
+ extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter);
++
+ extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+-extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
+- struct wake_q_head *wqh);
++extern int rt_mutex_futex_trylock(struct rt_mutex *l);
++
++extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
++extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
++ struct wake_q_head *wqh);
++
+ extern void rt_mutex_adjust_prio(struct task_struct *task);
+
+ #ifdef CONFIG_DEBUG_RT_MUTEXES
diff --git a/patches/0005-futex-Change-locking-rules.patch b/patches/0005-futex-Change-locking-rules.patch
new file mode 100644
index 000000000000..a6a3f0ad08fe
--- /dev/null
+++ b/patches/0005-futex-Change-locking-rules.patch
@@ -0,0 +1,370 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:52 +0100
+Subject: [PATCH] futex: Change locking rules
+
+Upstream commit 734009e96d1983ad739e5b656e03430b3660c913
+
+Currently futex-pi relies on hb->lock to serialize everything. But hb->lock
+creates another set of problems, especially priority inversions on RT where
+hb->lock becomes a rt_mutex itself.
+
+The rt_mutex::wait_lock is the most obvious protection for keeping the
+futex user space value and the kernel internal pi_state in sync.
+
+Rework and document the locking so rt_mutex::wait_lock is held across all
+operations which modify the user space value and the pi state.
+
+This allows invoking rt_mutex_unlock() (including deboost) without holding
+hb->lock as a next step.
+
+Nothing yet relies on the new locking rules.
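+
+Reduced to a userspace sketch (illustrative; the pthread mutexes are
+stand-ins for hb->lock, pi_mutex->wait_lock and p->pi_lock), the
+documented nesting is simply "always acquire in one global order":
+
+  #include <pthread.h>
+
+  static pthread_mutex_t hb_lock   = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_mutex_t wait_lock = PTHREAD_MUTEX_INITIALIZER;
+  static pthread_mutex_t pi_lock   = PTHREAD_MUTEX_INITIALIZER;
+
+  static void fixup_state(void)
+  {
+          pthread_mutex_lock(&hb_lock);   /* hb -> futex_q       */
+          pthread_mutex_lock(&wait_lock); /* {uval, pi_state}    */
+          pthread_mutex_lock(&pi_lock);   /* pi_state_list links */
+
+          /* ... modify the protected state ... */
+
+          /* release innermost first; two threads can then never
+           * hold these locks in conflicting order */
+          pthread_mutex_unlock(&pi_lock);
+          pthread_mutex_unlock(&wait_lock);
+          pthread_mutex_unlock(&hb_lock);
+  }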
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.751993333@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 165 +++++++++++++++++++++++++++++++++++++++++++++------------
+ 1 file changed, 132 insertions(+), 33 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -971,6 +971,39 @@ void exit_pi_state_list(struct task_stru
+ *
+ * [10] There is no transient state which leaves owner and user space
+ * TID out of sync.
++ *
++ *
++ * Serialization and lifetime rules:
++ *
++ * hb->lock:
++ *
++ * hb -> futex_q, relation
++ * futex_q -> pi_state, relation
++ *
++ * (cannot be raw because hb can contain arbitrary amount
++ * of futex_q's)
++ *
++ * pi_mutex->wait_lock:
++ *
++ * {uval, pi_state}
++ *
++ * (and pi_mutex 'obviously')
++ *
++ * p->pi_lock:
++ *
++ * p->pi_state_list -> pi_state->list, relation
++ *
++ * pi_state->refcount:
++ *
++ * pi_state lifetime
++ *
++ *
++ * Lock order:
++ *
++ * hb->lock
++ * pi_mutex->wait_lock
++ * p->pi_lock
++ *
+ */
+
+ /*
+@@ -978,10 +1011,12 @@ void exit_pi_state_list(struct task_stru
+ * the pi_state against the user space value. If correct, attach to
+ * it.
+ */
+-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
++static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
++ struct futex_pi_state *pi_state,
+ struct futex_pi_state **ps)
+ {
+ pid_t pid = uval & FUTEX_TID_MASK;
++ int ret, uval2;
+
+ /*
+ * Userspace might have messed up non-PI and PI futexes [3]
+@@ -989,9 +1024,34 @@ static int attach_to_pi_state(u32 uval,
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
++ /*
++ * We get here with hb->lock held, and having found a
++ * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
++ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
++ * which in turn means that futex_lock_pi() still has a reference on
++ * our pi_state.
++ */
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+ /*
++ * Now that we have a pi_state, we can acquire wait_lock
++ * and do the state validation.
++ */
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ /*
++ * Since {uval, pi_state} is serialized by wait_lock, and our current
++ * uval was read without holding it, it can have changed. Verify it
++ * still is what we expect it to be, otherwise retry the entire
++ * operation.
++ */
++ if (get_futex_value_locked(&uval2, uaddr))
++ goto out_efault;
++
++ if (uval != uval2)
++ goto out_eagain;
++
++ /*
+ * Handle the owner died case:
+ */
+ if (uval & FUTEX_OWNER_DIED) {
+@@ -1006,11 +1066,11 @@ static int attach_to_pi_state(u32 uval,
+ * is not 0. Inconsistent state. [5]
+ */
+ if (pid)
+- return -EINVAL;
++ goto out_einval;
+ /*
+ * Take a ref on the state and return success. [4]
+ */
+- goto out_state;
++ goto out_attach;
+ }
+
+ /*
+@@ -1022,14 +1082,14 @@ static int attach_to_pi_state(u32 uval,
+ * Take a ref on the state and return success. [6]
+ */
+ if (!pid)
+- goto out_state;
++ goto out_attach;
+ } else {
+ /*
+ * If the owner died bit is not set, then the pi_state
+ * must have an owner. [7]
+ */
+ if (!pi_state->owner)
+- return -EINVAL;
++ goto out_einval;
+ }
+
+ /*
+@@ -1038,11 +1098,29 @@ static int attach_to_pi_state(u32 uval,
+ * user space TID. [9/10]
+ */
+ if (pid != task_pid_vnr(pi_state->owner))
+- return -EINVAL;
+-out_state:
++ goto out_einval;
++
++out_attach:
+ atomic_inc(&pi_state->refcount);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
++
++out_einval:
++ ret = -EINVAL;
++ goto out_error;
++
++out_eagain:
++ ret = -EAGAIN;
++ goto out_error;
++
++out_efault:
++ ret = -EFAULT;
++ goto out_error;
++
++out_error:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return ret;
+ }
+
+ /*
+@@ -1093,6 +1171,9 @@ static int attach_to_pi_owner(u32 uval,
+
+ /*
+ * No existing pi state. First waiter. [2]
++ *
++ * This creates pi_state, we have hb->lock held, this means nothing can
++ * observe this state, wait_lock is irrelevant.
+ */
+ pi_state = alloc_pi_state();
+
+@@ -1117,7 +1198,8 @@ static int attach_to_pi_owner(u32 uval,
+ return 0;
+ }
+
+-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
++static int lookup_pi_state(u32 __user *uaddr, u32 uval,
++ struct futex_hash_bucket *hb,
+ union futex_key *key, struct futex_pi_state **ps)
+ {
+ struct futex_q *top_waiter = futex_top_waiter(hb, key);
+@@ -1127,7 +1209,7 @@ static int lookup_pi_state(u32 uval, str
+ * attach to the pi_state when the validation succeeds.
+ */
+ if (top_waiter)
+- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * We are the first waiter - try to look up the owner based on
+@@ -1146,7 +1228,7 @@ static int lock_pi_update_atomic(u32 __u
+ if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
+ return -EFAULT;
+
+- /*If user space value changed, let the caller retry */
++ /* If user space value changed, let the caller retry */
+ return curval != uval ? -EAGAIN : 0;
+ }
+
+@@ -1202,7 +1284,7 @@ static int futex_lock_pi_atomic(u32 __us
+ */
+ top_waiter = futex_top_waiter(hb, key);
+ if (top_waiter)
+- return attach_to_pi_state(uval, top_waiter->pi_state, ps);
++ return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
+
+ /*
+ * No waiter and user TID is 0. We are here because the
+@@ -1334,6 +1416,7 @@ static int wake_futex_pi(u32 __user *uad
+
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
+ ret = -EFAULT;
++
+ } else if (curval != uval) {
+ /*
+ * If a unconditional UNLOCK_PI operation (user space did not
+@@ -1346,6 +1429,7 @@ static int wake_futex_pi(u32 __user *uad
+ else
+ ret = -EINVAL;
+ }
++
+ if (ret) {
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ return ret;
+@@ -1821,7 +1905,7 @@ static int futex_requeue(u32 __user *uad
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
+ */
+- ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
++ ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
+ }
+
+ switch (ret) {
+@@ -2120,10 +2204,13 @@ static int fixup_pi_state_owner(u32 __us
+ {
+ u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
+ struct futex_pi_state *pi_state = q->pi_state;
+- struct task_struct *oldowner = pi_state->owner;
+ u32 uval, uninitialized_var(curval), newval;
++ struct task_struct *oldowner;
+ int ret;
+
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++
++ oldowner = pi_state->owner;
+ /* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+@@ -2139,11 +2226,10 @@ static int fixup_pi_state_owner(u32 __us
+ * because we can fault here. Imagine swapped out pages or a fork
+ * that marked all the anonymous memory readonly for cow.
+ *
+- * Modifying pi_state _before_ the user space value would
+- * leave the pi_state in an inconsistent state when we fault
+- * here, because we need to drop the hash bucket lock to
+- * handle the fault. This might be observed in the PID check
+- * in lookup_pi_state.
++ * Modifying pi_state _before_ the user space value would leave the
++ * pi_state in an inconsistent state when we fault here, because we
++ * need to drop the locks to handle the fault. This might be observed
++ * in the PID check in lookup_pi_state.
+ */
+ retry:
+ if (get_futex_value_locked(&uval, uaddr))
+@@ -2164,47 +2250,60 @@ static int fixup_pi_state_owner(u32 __us
+ * itself.
+ */
+ if (pi_state->owner != NULL) {
+- raw_spin_lock_irq(&pi_state->owner->pi_lock);
++ raw_spin_lock(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
++ raw_spin_unlock(&pi_state->owner->pi_lock);
+ }
+
+ pi_state->owner = newowner;
+
+- raw_spin_lock_irq(&newowner->pi_lock);
++ raw_spin_lock(&newowner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
+ list_add(&pi_state->list, &newowner->pi_state_list);
+- raw_spin_unlock_irq(&newowner->pi_lock);
++ raw_spin_unlock(&newowner->pi_lock);
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++
+ return 0;
+
+ /*
+- * To handle the page fault we need to drop the hash bucket
+- * lock here. That gives the other task (either the highest priority
+- * waiter itself or the task which stole the rtmutex) the
+- * chance to try the fixup of the pi_state. So once we are
+- * back from handling the fault we need to check the pi_state
+- * after reacquiring the hash bucket lock and before trying to
+- * do another fixup. When the fixup has been done already we
+- * simply return.
++ * To handle the page fault we need to drop the locks here. That gives
++ * the other task (either the highest priority waiter itself or the
++ * task which stole the rtmutex) the chance to try the fixup of the
++ * pi_state. So once we are back from handling the fault we need to
++ * check the pi_state after reacquiring the locks and before trying to
++ * do another fixup. When the fixup has been done already we simply
++ * return.
++ *
++ * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
++ * drop hb->lock since the caller owns the hb -> futex_q relation.
++ * Dropping the pi_mutex->wait_lock requires the state revalidate.
+ */
+ handle_fault:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(q->lock_ptr);
+
+ ret = fault_in_user_writeable(uaddr);
+
+ spin_lock(q->lock_ptr);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+ /*
+ * Check if someone else fixed it for us:
+ */
+- if (pi_state->owner != oldowner)
+- return 0;
++ if (pi_state->owner != oldowner) {
++ ret = 0;
++ goto out_unlock;
++ }
+
+ if (ret)
+- return ret;
++ goto out_unlock;
+
+ goto retry;
++
++out_unlock:
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return ret;
+ }
+
+ static long futex_wait_restart(struct restart_block *restart);
diff --git a/patches/0006-futex-Cleanup-refcounting.patch b/patches/0006-futex-Cleanup-refcounting.patch
new file mode 100644
index 000000000000..e1e7b05733c8
--- /dev/null
+++ b/patches/0006-futex-Cleanup-refcounting.patch
@@ -0,0 +1,75 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:53 +0100
+Subject: [PATCH] futex: Cleanup refcounting
+
+Upstream commit bf92cf3a5100f5a0d5f9834787b130159397cb22
+
+Add a put_pi_state() as counterpart for get_pi_state() so the refcounting
+becomes consistent.
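+
+The pattern, as a self-contained userspace sketch (illustrative; C11
+atomics and hypothetical names, not the kernel implementation):
+
+  #include <stdatomic.h>
+  #include <stdlib.h>
+
+  struct state { atomic_int refcount; };
+
+  static void get_state(struct state *s)
+  {
+          /* like atomic_inc_not_zero(): taking a new reference is
+           * only legal while at least one is still held */
+          int old = atomic_load(&s->refcount);
+
+          while (old > 0 &&
+                 !atomic_compare_exchange_weak(&s->refcount,
+                                               &old, old + 1))
+                  ;
+  }
+
+  static void put_state(struct state *s)
+  {
+          /* the counterpart: free on the last reference */
+          if (atomic_fetch_sub(&s->refcount, 1) == 1)
+                  free(s);
+  }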
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.801778516@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 13 +++++++++----
+ 1 file changed, 9 insertions(+), 4 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -800,7 +800,7 @@ static int refill_pi_state_cache(void)
+ return 0;
+ }
+
+-static struct futex_pi_state * alloc_pi_state(void)
++static struct futex_pi_state *alloc_pi_state(void)
+ {
+ struct futex_pi_state *pi_state = current->pi_state_cache;
+
+@@ -810,6 +810,11 @@ static struct futex_pi_state * alloc_pi_
+ return pi_state;
+ }
+
++static void get_pi_state(struct futex_pi_state *pi_state)
++{
++ WARN_ON_ONCE(!atomic_inc_not_zero(&pi_state->refcount));
++}
++
+ /*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+@@ -854,7 +859,7 @@ static void put_pi_state(struct futex_pi
+ * Look up the task based on what TID userspace gave us.
+ * We dont trust it.
+ */
+-static struct task_struct * futex_find_get_task(pid_t pid)
++static struct task_struct *futex_find_get_task(pid_t pid)
+ {
+ struct task_struct *p;
+
+@@ -1101,7 +1106,7 @@ static int attach_to_pi_state(u32 __user
+ goto out_einval;
+
+ out_attach:
+- atomic_inc(&pi_state->refcount);
++ get_pi_state(pi_state);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+ *ps = pi_state;
+ return 0;
+@@ -1988,7 +1993,7 @@ static int futex_requeue(u32 __user *uad
+ * refcount on the pi_state and store the pointer in
+ * the futex_q object of the waiter.
+ */
+- atomic_inc(&pi_state->refcount);
++ get_pi_state(pi_state);
+ this->pi_state = pi_state;
+ ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+ this->rt_waiter,
diff --git a/patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch b/patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch
new file mode 100644
index 000000000000..c07c8076e29b
--- /dev/null
+++ b/patches/0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch
@@ -0,0 +1,139 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:54 +0100
+Subject: [PATCH] futex: Rework inconsistent rt_mutex/futex_q state
+
+Upstream commit 73d786bd043ebc855f349c81ea805f6b11cbf2aa
+
+There is a weird state in the futex_unlock_pi() path when it interleaves
+with a concurrent futex_lock_pi() at the point where it drops hb->lock.
+
+In this case, it can happen that the rt_mutex wait_list and the futex_q
+disagree on pending waiters, in particular rt_mutex will find no pending
+waiters where futex_q thinks there are. In this case the rt_mutex unlock
+code cannot assign an owner.
+
+The futex side fixup code has to clean up the inconsistencies with quite a
+bunch of interesting corner cases.
+
+Simplify all this by changing wake_futex_pi() to return -EAGAIN when this
+situation occurs. This then gives the futex_lock_pi() code the opportunity
+to continue and the retried futex_unlock_pi() will now observe a coherent
+state.
+
+The only problem is that this breaks RT timeliness guarantees. That
+is, consider the following scenario:
+
+ T1 and T2 are both pinned to CPU0. prio(T2) > prio(T1)
+
+ CPU0
+
+ T1
+ lock_pi()
+ queue_me() <- Waiter is visible
+
+ preemption
+
+ T2
+ unlock_pi()
+ loops with -EAGAIN forever
+
+Which is undesirable for PI primitives. Future patches will rectify
+this.
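+
+The retry shape this introduces, as a sketch (illustrative;
+demo_wake_top_waiter() is a hypothetical stand-in for the
+wake_futex_pi() call in futex_unlock_pi()):
+
+  #include <errno.h>
+
+  static int demo_wake_top_waiter(void)
+  {
+          /* stand-in: would return -EAGAIN while the rt_mutex
+           * wait_list and the futex_q waiters disagree */
+          return 0;
+  }
+
+  static int demo_unlock(void)
+  {
+          int ret;
+
+          do {
+                  /* on -EAGAIN: drop the locks and retry, giving
+                   * the futex_lock_pi() instance time to either
+                   * block on the rtmutex or dequeue itself */
+                  ret = demo_wake_top_waiter();
+          } while (ret == -EAGAIN);
+
+          return ret;
+  }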
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.850383690@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 50 ++++++++++++++------------------------------------
+ 1 file changed, 14 insertions(+), 36 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1402,12 +1402,19 @@ static int wake_futex_pi(u32 __user *uad
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+
+ /*
+- * It is possible that the next waiter (the one that brought
+- * top_waiter to the kernel) timed out and is no longer
+- * waiting on the lock.
++ * When we interleave with futex_lock_pi() where it does
++ * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
++ * but the rt_mutex's wait_list can be empty (either still, or again,
++ * depending on which side we land).
++ *
++ * When this happens, give up our locks and try again, giving the
++ * futex_lock_pi() instance time to complete, either by waiting on the
++ * rtmutex or removing itself from the futex queue.
+ */
+- if (!new_owner)
+- new_owner = top_waiter->task;
++ if (!new_owner) {
++ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
++ return -EAGAIN;
++ }
+
+ /*
+ * We pass it to the next owner. The WAITERS bit is always
+@@ -2330,7 +2337,6 @@ static long futex_wait_restart(struct re
+ */
+ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
+ {
+- struct task_struct *owner;
+ int ret = 0;
+
+ if (locked) {
+@@ -2344,43 +2350,15 @@ static int fixup_owner(u32 __user *uaddr
+ }
+
+ /*
+- * Catch the rare case, where the lock was released when we were on the
+- * way back before we locked the hash bucket.
+- */
+- if (q->pi_state->owner == current) {
+- /*
+- * Try to get the rt_mutex now. This might fail as some other
+- * task acquired the rt_mutex after we removed ourself from the
+- * rt_mutex waiters list.
+- */
+- if (rt_mutex_futex_trylock(&q->pi_state->pi_mutex)) {
+- locked = 1;
+- goto out;
+- }
+-
+- /*
+- * pi_state is incorrect, some other task did a lock steal and
+- * we returned due to timeout or signal without taking the
+- * rt_mutex. Too late.
+- */
+- raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
+- owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+- if (!owner)
+- owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
+- raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
+- ret = fixup_pi_state_owner(uaddr, q, owner);
+- goto out;
+- }
+-
+- /*
+ * Paranoia check. If we did not take the lock, then we should not be
+ * the owner of the rt_mutex.
+ */
+- if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
++ if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) {
+ printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+ "pi-state %p\n", ret,
+ q->pi_state->pi_mutex.owner,
+ q->pi_state->owner);
++ }
+
+ out:
+ return ret ? ret : locked;
diff --git a/patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch b/patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch
new file mode 100644
index 000000000000..53d4c2257a8a
--- /dev/null
+++ b/patches/0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch
@@ -0,0 +1,357 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:55 +0100
+Subject: [PATCH] futex: Pull rt_mutex_futex_unlock() out from under hb->lock
+
+Upstream commit 16ffa12d742534d4ff73e8b3a4e81c1de39196f0
+
+There are a number of 'interesting' problems, all caused by holding
+hb->lock while doing the rt_mutex_unlock() equivalent.
+
+Notably:
+
+ - a PI inversion on hb->lock; and,
+
+ - a SCHED_DEADLINE crash because of pointer instability.
+
+The previous changes:
+
+ - changed the locking rules to cover {uval,pi_state} with wait_lock.
+
+ - allowed rt_mutex_futex_unlock() to be done without dropping wait_lock,
+ which in turn allows relying on wait_lock atomicity completely.
+
+ - simplified the waiter conundrum.
+
+It's now sufficient to hold rtmutex::wait_lock and a reference on the
+pi_state to protect the state consistency, so hb->lock can be dropped
+before calling rt_mutex_futex_unlock().
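+
+Condensed from the futex_unlock_pi() hunk below (a summary of the
+sequence, not a verbatim quote), the resulting unlock shape is:
+
+  top_waiter = futex_top_waiter(hb, &key);
+  if (top_waiter) {
+          pi_state = top_waiter->pi_state;
+
+          /* the reference keeps pi_state alive ... */
+          get_pi_state(pi_state);
+          /* ... so hb->lock can be dropped before the unlock;
+           * wake_futex_pi() revalidates under wait_lock */
+          spin_unlock(&hb->lock);
+
+          ret = wake_futex_pi(uaddr, uval, pi_state);
+          put_pi_state(pi_state);
+  }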
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 154 +++++++++++++++++++++++++++++++++++++--------------------
+ 1 file changed, 100 insertions(+), 54 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -919,10 +919,12 @@ void exit_pi_state_list(struct task_stru
+ pi_state->owner = NULL;
+ raw_spin_unlock_irq(&curr->pi_lock);
+
+- rt_mutex_futex_unlock(&pi_state->pi_mutex);
+-
++ get_pi_state(pi_state);
+ spin_unlock(&hb->lock);
+
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++
+ raw_spin_lock_irq(&curr->pi_lock);
+ }
+ raw_spin_unlock_irq(&curr->pi_lock);
+@@ -1035,6 +1037,11 @@ static int attach_to_pi_state(u32 __user
+ * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+ * which in turn means that futex_lock_pi() still has a reference on
+ * our pi_state.
++ *
++ * The waiter holding a reference on @pi_state also protects against
++ * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
++ * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
++ * free pi_state before we can take a reference ourselves.
+ */
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
+@@ -1378,48 +1385,40 @@ static void mark_wake_futex(struct wake_
+ smp_store_release(&q->lock_ptr, NULL);
+ }
+
+-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
+- struct futex_hash_bucket *hb)
++/*
++ * Caller must hold a reference on @pi_state.
++ */
++static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
+ {
+- struct task_struct *new_owner;
+- struct futex_pi_state *pi_state = top_waiter->pi_state;
+ u32 uninitialized_var(curval), newval;
++ struct task_struct *new_owner;
++ bool deboost = false;
+ WAKE_Q(wake_q);
+- bool deboost;
+ int ret = 0;
+
+- if (!pi_state)
+- return -EINVAL;
+-
+- /*
+- * If current does not own the pi_state then the futex is
+- * inconsistent and user space fiddled with the futex value.
+- */
+- if (pi_state->owner != current)
+- return -EINVAL;
+-
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+-
+- /*
+- * When we interleave with futex_lock_pi() where it does
+- * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
+- * but the rt_mutex's wait_list can be empty (either still, or again,
+- * depending on which side we land).
+- *
+- * When this happens, give up our locks and try again, giving the
+- * futex_lock_pi() instance time to complete, either by waiting on the
+- * rtmutex or removing itself from the futex queue.
+- */
+ if (!new_owner) {
+- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+- return -EAGAIN;
++ /*
++ * Since we held neither hb->lock nor wait_lock when coming
++ * into this function, we could have raced with futex_lock_pi()
++ * such that we might observe @this futex_q waiter, but the
++ * rt_mutex's wait_list can be empty (either still, or again,
++ * depending on which side we land).
++ *
++ * When this happens, give up our locks and try again, giving
++ * the futex_lock_pi() instance time to complete, either by
++ * waiting on the rtmutex or removing itself from the futex
++ * queue.
++ */
++ ret = -EAGAIN;
++ goto out_unlock;
+ }
+
+ /*
+- * We pass it to the next owner. The WAITERS bit is always
+- * kept enabled while there is PI state around. We cleanup the
+- * owner died bit, because we are the owner.
++ * We pass it to the next owner. The WAITERS bit is always kept
++ * enabled while there is PI state around. We cleanup the owner
++ * died bit, because we are the owner.
+ */
+ newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
+
+@@ -1442,10 +1441,8 @@ static int wake_futex_pi(u32 __user *uad
+ ret = -EINVAL;
+ }
+
+- if (ret) {
+- raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+- return ret;
+- }
++ if (ret)
++ goto out_unlock;
+
+ raw_spin_lock(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+@@ -1463,15 +1460,15 @@ static int wake_futex_pi(u32 __user *uad
+ */
+ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
+
++out_unlock:
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+- spin_unlock(&hb->lock);
+
+ if (deboost) {
+ wake_up_q(&wake_q);
+ rt_mutex_adjust_prio(current);
+ }
+
+- return 0;
++ return ret;
+ }
+
+ /*
+@@ -2230,7 +2227,8 @@ static int fixup_pi_state_owner(u32 __us
+ /*
+ * We are here either because we stole the rtmutex from the
+ * previous highest priority waiter or we are the highest priority
+- * waiter but failed to get the rtmutex the first time.
++ * waiter but have failed to get the rtmutex the first time.
++ *
+ * We have to replace the newowner TID in the user space variable.
+ * This must be atomic as we have to preserve the owner died bit here.
+ *
+@@ -2247,7 +2245,7 @@ static int fixup_pi_state_owner(u32 __us
+ if (get_futex_value_locked(&uval, uaddr))
+ goto handle_fault;
+
+- while (1) {
++ for (;;) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
+@@ -2343,6 +2341,10 @@ static int fixup_owner(u32 __user *uaddr
+ /*
+ * Got the lock. We might not be the anticipated owner if we
+ * did a lock-steal - fix up the PI-state in that case:
++ *
++ * We can safely read pi_state->owner without holding wait_lock
++ * because we now own the rt_mutex, only the owner will attempt
++ * to change it.
+ */
+ if (q->pi_state->owner != current)
+ ret = fixup_pi_state_owner(uaddr, q, current);
+@@ -2582,6 +2584,7 @@ static int futex_lock_pi(u32 __user *uad
+ ktime_t *time, int trylock)
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
++ struct futex_pi_state *pi_state = NULL;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+@@ -2668,12 +2671,19 @@ static int futex_lock_pi(u32 __user *uad
+ * If fixup_owner() faulted and was unable to handle the fault, unlock
+ * it and return the fault to userspace.
+ */
+- if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+- rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
++ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
+
+ /* Unqueue and drop the lock */
+ unqueue_me_pi(&q);
+
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
+ goto out_put_key;
+
+ out_unlock_put_key:
+@@ -2736,10 +2746,36 @@ static int futex_unlock_pi(u32 __user *u
+ */
+ top_waiter = futex_top_waiter(hb, &key);
+ if (top_waiter) {
+- ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
++ struct futex_pi_state *pi_state = top_waiter->pi_state;
++
++ ret = -EINVAL;
++ if (!pi_state)
++ goto out_unlock;
++
++ /*
++ * If current does not own the pi_state then the futex is
++ * inconsistent and user space fiddled with the futex value.
++ */
++ if (pi_state->owner != current)
++ goto out_unlock;
++
++ /*
++ * Grab a reference on the pi_state and drop hb->lock.
++ *
++ * The reference ensures pi_state lives, dropping the hb->lock
++ * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
++ * close the races against futex_lock_pi(), but in case of
++ * _any_ fail we'll abort and retry the whole deal.
++ */
++ get_pi_state(pi_state);
++ spin_unlock(&hb->lock);
++
++ ret = wake_futex_pi(uaddr, uval, pi_state);
++
++ put_pi_state(pi_state);
++
+ /*
+- * In case of success wake_futex_pi dropped the hash
+- * bucket lock.
++ * Success, we're done! No tricky corner cases.
+ */
+ if (!ret)
+ goto out_putkey;
+@@ -2754,7 +2790,6 @@ static int futex_unlock_pi(u32 __user *u
+ * setting the FUTEX_WAITERS bit. Try again.
+ */
+ if (ret == -EAGAIN) {
+- spin_unlock(&hb->lock);
+ put_futex_key(&key);
+ goto retry;
+ }
+@@ -2762,7 +2797,7 @@ static int futex_unlock_pi(u32 __user *u
+ * wake_futex_pi has detected invalid state. Tell user
+ * space.
+ */
+- goto out_unlock;
++ goto out_putkey;
+ }
+
+ /*
+@@ -2772,8 +2807,10 @@ static int futex_unlock_pi(u32 __user *u
+ * preserve the WAITERS bit not the OWNER_DIED one. We are the
+ * owner.
+ */
+- if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
++ if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
++ spin_unlock(&hb->lock);
+ goto pi_faulted;
++ }
+
+ /*
+ * If uval has changed, let user space handle it.
+@@ -2787,7 +2824,6 @@ static int futex_unlock_pi(u32 __user *u
+ return ret;
+
+ pi_faulted:
+- spin_unlock(&hb->lock);
+ put_futex_key(&key);
+
+ ret = fault_in_user_writeable(uaddr);
+@@ -2891,6 +2927,7 @@ static int futex_wait_requeue_pi(u32 __u
+ u32 __user *uaddr2)
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
++ struct futex_pi_state *pi_state = NULL;
+ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ union futex_key key2 = FUTEX_KEY_INIT;
+@@ -2975,8 +3012,10 @@ static int futex_wait_requeue_pi(u32 __u
+ if (q.pi_state && (q.pi_state->owner != current)) {
+ spin_lock(q.lock_ptr);
+ ret = fixup_pi_state_owner(uaddr2, &q, current);
+- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
+- rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+@@ -3015,13 +3054,20 @@ static int futex_wait_requeue_pi(u32 __u
+ * the fault, unlock the rt_mutex and return the fault to
+ * userspace.
+ */
+- if (ret && rt_mutex_owner(pi_mutex) == current)
+- rt_mutex_futex_unlock(pi_mutex);
++ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
++ pi_state = q.pi_state;
++ get_pi_state(pi_state);
++ }
+
+ /* Unqueue and drop the lock. */
+ unqueue_me_pi(&q);
+ }
+
++ if (pi_state) {
++ rt_mutex_futex_unlock(&pi_state->pi_mutex);
++ put_pi_state(pi_state);
++ }
++
+ if (ret == -EINTR) {
+ /*
+ * We've already been requeued, but cannot restart by calling
diff --git a/patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch b/patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch
new file mode 100644
index 000000000000..10b1039f290e
--- /dev/null
+++ b/patches/0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch
@@ -0,0 +1,79 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:56 +0100
+Subject: [PATCH] futex,rt_mutex: Introduce rt_mutex_init_waiter()
+
+Upstream commit 50809358dd7199aa7ce232f6877dd09ec30ef374
+
+Since there are already two copies of this code, introduce a helper now
+before adding a third one.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104151.950039479@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 5 +----
+ kernel/locking/rtmutex.c | 12 +++++++++---
+ kernel/locking/rtmutex_common.h | 1 +
+ 3 files changed, 11 insertions(+), 7 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2954,10 +2954,7 @@ static int futex_wait_requeue_pi(u32 __u
+ * The waiter is allocated on our stack, manipulated by the requeue
+ * code while we sleep on uaddr.
+ */
+- debug_rt_mutex_init_waiter(&rt_waiter);
+- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
+- RB_CLEAR_NODE(&rt_waiter.tree_entry);
+- rt_waiter.task = NULL;
++ rt_mutex_init_waiter(&rt_waiter);
+
+ ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
+ if (unlikely(ret != 0))
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1151,6 +1151,14 @@ void rt_mutex_adjust_pi(struct task_stru
+ next_lock, NULL, task);
+ }
+
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++{
++ debug_rt_mutex_init_waiter(waiter);
++ RB_CLEAR_NODE(&waiter->pi_tree_entry);
++ RB_CLEAR_NODE(&waiter->tree_entry);
++ waiter->task = NULL;
++}
++
+ /**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock: the rt_mutex to take
+@@ -1233,9 +1241,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+ unsigned long flags;
+ int ret = 0;
+
+- debug_rt_mutex_init_waiter(&waiter);
+- RB_CLEAR_NODE(&waiter.pi_tree_entry);
+- RB_CLEAR_NODE(&waiter.tree_entry);
++ rt_mutex_init_waiter(&waiter);
+
+ /*
+ * Technically we could use raw_spin_[un]lock_irq() here, but this can
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -103,6 +103,7 @@ extern void rt_mutex_init_proxy_locked(s
+ struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
diff --git a/patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch b/patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch
new file mode 100644
index 000000000000..10af5d18f2ea
--- /dev/null
+++ b/patches/0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch
@@ -0,0 +1,158 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:57 +0100
+Subject: [PATCH] futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()
+
+Upstream commit 38d589f2fd08f1296aea3ce62bebd185125c6d81
+
+With the ultimate goal of keeping rt_mutex wait_list and futex_q waiters
+consistent it's necessary to split 'rt_mutex_futex_lock()' into finer
+parts, such that only the actual blocking can be done without hb->lock
+held.
+
+Split rt_mutex_finish_proxy_lock() into two parts, one that does the
+blocking and one that does remove_waiter() when the lock acquisition failed.
+
+When the rtmutex was acquired successfully the waiter can be removed in the
+acquisition path safely, since there is no concurrency on the lock owner.
+
+This means that, except for futex_lock_pi(), all wait_list modifications
+are done with both hb->lock and wait_lock held.
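+
+On the caller side this produces the shape below (condensed from the
+futex.c hunk that follows, not a verbatim quote):
+
+  /* sleep on the rt_mutex without holding hb->lock ... */
+  ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+  /* ... reacquire hb->lock, then clean up on failure; if the lock
+   * was granted in the meantime, the failure is disregarded and we
+   * report success */
+  spin_lock(q.lock_ptr);
+  if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+          ret = 0;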
+
+[bigeasy@linutronix.de: fix for futex_requeue_pi_signal_restart]
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.001659630@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 7 +++--
+ kernel/locking/rtmutex.c | 52 ++++++++++++++++++++++++++++++++++------
+ kernel/locking/rtmutex_common.h | 8 +++---
+ 3 files changed, 55 insertions(+), 12 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -3030,10 +3030,13 @@ static int futex_wait_requeue_pi(u32 __u
+ */
+ WARN_ON(!q.pi_state);
+ pi_mutex = &q.pi_state->pi_mutex;
+- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
+- debug_rt_mutex_free_waiter(&rt_waiter);
++ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
+
+ spin_lock(q.lock_ptr);
++ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
++ ret = 0;
++
++ debug_rt_mutex_free_waiter(&rt_waiter);
+ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1743,21 +1743,23 @@ struct task_struct *rt_mutex_next_owner(
+ }
+
+ /**
+- * rt_mutex_finish_proxy_lock() - Complete lock acquisition
++ * rt_mutex_wait_proxy_lock() - Wait for lock acquisition
+ * @lock: the rt_mutex we were woken on
+ * @to: the timeout, null if none. hrtimer should already have
+ * been started.
+ * @waiter: the pre-initialized rt_mutex_waiter
+ *
+- * Complete the lock acquisition started our behalf by another thread.
++ * Wait for the the lock acquisition started on our behalf by
++ * rt_mutex_start_proxy_lock(). Upon failure, the caller must call
++ * rt_mutex_cleanup_proxy_lock().
+ *
+ * Returns:
+ * 0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT
+ *
+- * Special API call for PI-futex requeue support
++ * Special API call for PI-futex support
+ */
+-int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
++int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
+ struct hrtimer_sleeper *to,
+ struct rt_mutex_waiter *waiter)
+ {
+@@ -1770,9 +1772,6 @@ int rt_mutex_finish_proxy_lock(struct rt
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+- if (unlikely(ret))
+- remove_waiter(lock, waiter);
+-
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+@@ -1783,3 +1782,42 @@ int rt_mutex_finish_proxy_lock(struct rt
+
+ return ret;
+ }
++
++/**
++ * rt_mutex_cleanup_proxy_lock() - Cleanup failed lock acquisition
++ * @lock: the rt_mutex we were woken on
++ * @waiter: the pre-initialized rt_mutex_waiter
++ *
++ * Attempt to clean up after a failed rt_mutex_wait_proxy_lock().
++ *
++ * Unless we acquired the lock, we're still enqueued on the wait-list and can
++ * in fact still be granted ownership until we're removed. Therefore we can
++ * find we are in fact the owner and must disregard the
++ * rt_mutex_wait_proxy_lock() failure.
++ *
++ * Returns:
++ * true - did the cleanup; we are done.
++ * false - we acquired the lock after rt_mutex_wait_proxy_lock() returned,
++ * caller should disregard its return value.
++ *
++ * Special API call for PI-futex support
++ */
++bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter)
++{
++ bool cleanup = false;
++
++ raw_spin_lock_irq(&lock->wait_lock);
++ /*
++ * Unless we're the owner, we're still enqueued on the wait_list.
++ * So check if we became owner, if not, take us off the wait_list.
++ */
++ if (rt_mutex_owner(lock) != current) {
++ remove_waiter(lock, waiter);
++ fixup_rt_mutex_waiters(lock);
++ cleanup = true;
++ }
++ raw_spin_unlock_irq(&lock->wait_lock);
++
++ return cleanup;
++}
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -107,9 +107,11 @@ extern void rt_mutex_init_waiter(struct
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+-extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+- struct hrtimer_sleeper *to,
+- struct rt_mutex_waiter *waiter);
++extern int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
++ struct hrtimer_sleeper *to,
++ struct rt_mutex_waiter *waiter);
++extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter);
+
+ extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
diff --git a/patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch b/patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch
new file mode 100644
index 000000000000..84017cee0304
--- /dev/null
+++ b/patches/0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch
@@ -0,0 +1,266 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:58 +0100
+Subject: [PATCH] futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()
+
+Upstream commit cfafcd117da0216520568c195cb2f6cd1980c4bb
+
+By changing futex_lock_pi() to use rt_mutex_*_proxy_lock(), all wait_list
+modifications are done under both hb->lock and wait_lock.
+
+This closes the obvious interleave pattern between futex_lock_pi() and
+futex_unlock_pi(), but not entirely so. See below:
+
+Before:
+
+futex_lock_pi() futex_unlock_pi()
+ unlock hb->lock
+
+ lock hb->lock
+ unlock hb->lock
+
+ lock rt_mutex->wait_lock
+ unlock rt_mutex->wait_lock
+ -EAGAIN
+
+ lock rt_mutex->wait_lock
+ list_add
+ unlock rt_mutex->wait_lock
+
+ schedule()
+
+ lock rt_mutex->wait_lock
+ list_del
+ unlock rt_mutex->wait_lock
+
+ <idem>
+ -EAGAIN
+
+ lock hb->lock
+
+
+After:
+
+futex_lock_pi() futex_unlock_pi()
+
+ lock hb->lock
+ lock rt_mutex->wait_lock
+ list_add
+ unlock rt_mutex->wait_lock
+ unlock hb->lock
+
+ schedule()
+ lock hb->lock
+ unlock hb->lock
+ lock hb->lock
+ lock rt_mutex->wait_lock
+ list_del
+ unlock rt_mutex->wait_lock
+
+ lock rt_mutex->wait_lock
+ unlock rt_mutex->wait_lock
+ -EAGAIN
+
+ unlock hb->lock
+
+
+It does, however, solve the earlier starvation/live-lock scenario which
+the -EAGAIN introduced: unlike the before scenario, where the -EAGAIN
+happens while futex_unlock_pi() doesn't hold any locks, in the after
+scenario it happens while futex_unlock_pi() actually holds a lock, and
+it is then serialized on that lock.
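+
+The lock side then takes the shape below (condensed from the futex.c
+hunk that follows, error handling elided, not a verbatim quote):
+
+  /* enqueue on the rt_mutex while still holding hb->lock, so the
+   * hb and rt_mutex wait lists stay consistent */
+  rt_mutex_init_waiter(&rt_waiter);
+  ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex,
+                                  &rt_waiter, current);
+  spin_unlock(q.lock_ptr);
+
+  /* block without hb->lock */
+  ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to,
+                                 &rt_waiter);
+
+  /* reacquire hb->lock before touching the wait lists again */
+  spin_lock(q.lock_ptr);
+  if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex,
+                                          &rt_waiter))
+          ret = 0;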
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.062785528@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 77 ++++++++++++++++++++++++++++------------
+ kernel/locking/rtmutex.c | 26 +++----------
+ kernel/locking/rtmutex_common.h | 1
+ 3 files changed, 62 insertions(+), 42 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2097,20 +2097,7 @@ queue_unlock(struct futex_hash_bucket *h
+ hb_waiters_dec(hb);
+ }
+
+-/**
+- * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+- * @q: The futex_q to enqueue
+- * @hb: The destination hash bucket
+- *
+- * The hb->lock must be held by the caller, and is released here. A call to
+- * queue_me() is typically paired with exactly one call to unqueue_me(). The
+- * exceptions involve the PI related operations, which may use unqueue_me_pi()
+- * or nothing if the unqueue is done as part of the wake process and the unqueue
+- * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
+- * an example).
+- */
+-static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+- __releases(&hb->lock)
++static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+ {
+ int prio;
+
+@@ -2127,6 +2114,24 @@ static inline void queue_me(struct futex
+ plist_node_init(&q->list, prio);
+ plist_add(&q->list, &hb->chain);
+ q->task = current;
++}
++
++/**
++ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
++ * @q: The futex_q to enqueue
++ * @hb: The destination hash bucket
++ *
++ * The hb->lock must be held by the caller, and is released here. A call to
++ * queue_me() is typically paired with exactly one call to unqueue_me(). The
++ * exceptions involve the PI related operations, which may use unqueue_me_pi()
++ * or nothing if the unqueue is done as part of the wake process and the unqueue
++ * state is implicit in the state of woken task (see futex_wait_requeue_pi() for
++ * an example).
++ */
++static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
++ __releases(&hb->lock)
++{
++ __queue_me(q, hb);
+ spin_unlock(&hb->lock);
+ }
+
+@@ -2585,6 +2590,7 @@ static int futex_lock_pi(u32 __user *uad
+ {
+ struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
++ struct rt_mutex_waiter rt_waiter;
+ struct futex_hash_bucket *hb;
+ struct futex_q q = futex_q_init;
+ int res, ret;
+@@ -2637,25 +2643,52 @@ static int futex_lock_pi(u32 __user *uad
+ }
+ }
+
++ WARN_ON(!q.pi_state);
++
+ /*
+ * Only actually queue now that the atomic ops are done:
+ */
+- queue_me(&q, hb);
++ __queue_me(&q, hb);
+
+- WARN_ON(!q.pi_state);
+- /*
+- * Block on the PI mutex:
+- */
+- if (!trylock) {
+- ret = rt_mutex_timed_futex_lock(&q.pi_state->pi_mutex, to);
+- } else {
++ if (trylock) {
+ ret = rt_mutex_futex_trylock(&q.pi_state->pi_mutex);
+ /* Fixup the trylock return value: */
+ ret = ret ? 0 : -EWOULDBLOCK;
++ goto no_block;
+ }
+
++ /*
++ * We must add ourselves to the rt_mutex waitlist while holding hb->lock
++ * such that the hb and rt_mutex wait lists match.
++ */
++ rt_mutex_init_waiter(&rt_waiter);
++ ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ if (ret) {
++ if (ret == 1)
++ ret = 0;
++
++ goto no_block;
++ }
++
++ spin_unlock(q.lock_ptr);
++
++ if (unlikely(to))
++ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
++
++ ret = rt_mutex_wait_proxy_lock(&q.pi_state->pi_mutex, to, &rt_waiter);
++
+ spin_lock(q.lock_ptr);
+ /*
++ * If we failed to acquire the lock (signal/timeout), we must
++ * first acquire the hb->lock before removing the lock from the
++ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
++ * wait lists consistent.
++ */
++ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
++ ret = 0;
++
++no_block:
++ /*
+ * Fixup the pi_state owner and possibly acquire the lock if we
+ * haven't already.
+ */
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1491,19 +1491,6 @@ int __sched rt_mutex_lock_interruptible(
+ EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
+
+ /*
+- * Futex variant with full deadlock detection.
+- * Futex variants must not use the fast-path, see __rt_mutex_futex_unlock().
+- */
+-int __sched rt_mutex_timed_futex_lock(struct rt_mutex *lock,
+- struct hrtimer_sleeper *timeout)
+-{
+- might_sleep();
+-
+- return rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE,
+- timeout, RT_MUTEX_FULL_CHAINWALK);
+-}
+-
+-/*
+ * Futex variant, must not use fastpath.
+ */
+ int __sched rt_mutex_futex_trylock(struct rt_mutex *lock)
+@@ -1772,12 +1759,6 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ /* sleep on the mutex */
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+
+- /*
+- * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+- * have to fix that up.
+- */
+- fixup_rt_mutex_waiters(lock);
+-
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+@@ -1817,6 +1798,13 @@ bool rt_mutex_cleanup_proxy_lock(struct
+ fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
++
++ /*
++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++ * have to fix that up.
++ */
++ fixup_rt_mutex_waiters(lock);
++
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return cleanup;
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -113,7 +113,6 @@ extern int rt_mutex_wait_proxy_lock(stru
+ extern bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter);
+
+-extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
+ extern int rt_mutex_futex_trylock(struct rt_mutex *l);
+
+ extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
diff --git a/patches/0012-futex-Futex_unlock_pi-determinism.patch b/patches/0012-futex-Futex_unlock_pi-determinism.patch
new file mode 100644
index 000000000000..9cd5ce650ff6
--- /dev/null
+++ b/patches/0012-futex-Futex_unlock_pi-determinism.patch
@@ -0,0 +1,80 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:35:59 +0100
+Subject: [PATCH] futex: Futex_unlock_pi() determinism
+
+Upstream commit bebe5b514345f09be2c15e414d076b02ecb9cce8
+
+The problem with returning -EAGAIN when the waiter state mismatches is that
+it becomes very hard to prove a bounded execution time for the
+operation. And seeing that this is an RT operation, this is somewhat
+important.
+
+While in practice, given the previous patch, it will be very unlikely to
+ever really take more than one or two rounds, proving so becomes rather
+hard.
+
+However, now that modifying the wait_list is done while holding both hb->lock
+and wait_lock, the scenario can be avoided entirely by acquiring wait_lock
+while still holding hb->lock, doing a hand-over without leaving a hole.
+
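+A simplified sketch of the resulting hand-over on the unlock side (names
+as in the kernel/futex.c hunk below):
+
+    spin_lock(&hb->lock);
+    ...
+    get_pi_state(pi_state);
+    raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); /* wait_lock first */
+    spin_unlock(&hb->lock);          /* no hole: one lock is always held */
+    ret = wake_futex_pi(uaddr, uval, pi_state);  /* runs under wait_lock */
+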
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: bigeasy@linutronix.de
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.112378812@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 24 +++++++++++-------------
+ 1 file changed, 11 insertions(+), 13 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -1396,15 +1396,10 @@ static int wake_futex_pi(u32 __user *uad
+ WAKE_Q(wake_q);
+ int ret = 0;
+
+- raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+- if (!new_owner) {
++ if (WARN_ON_ONCE(!new_owner)) {
+ /*
+- * Since we held neither hb->lock nor wait_lock when coming
+- * into this function, we could have raced with futex_lock_pi()
+- * such that we might observe @this futex_q waiter, but the
+- * rt_mutex's wait_list can be empty (either still, or again,
+- * depending on which side we land).
++ * As per the comment in futex_unlock_pi() this should not happen.
+ *
+ * When this happens, give up our locks and try again, giving
+ * the futex_lock_pi() instance time to complete, either by
+@@ -2792,15 +2787,18 @@ static int futex_unlock_pi(u32 __user *u
+ if (pi_state->owner != current)
+ goto out_unlock;
+
++ get_pi_state(pi_state);
+ /*
+- * Grab a reference on the pi_state and drop hb->lock.
++ * Since modifying the wait_list is done while holding both
++ * hb->lock and wait_lock, holding either is sufficient to
++ * observe it.
+ *
+- * The reference ensures pi_state lives, dropping the hb->lock
+- * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
+- * close the races against futex_lock_pi(), but in case of
+- * _any_ fail we'll abort and retry the whole deal.
++ * By taking wait_lock while still holding hb->lock, we ensure
++ * there is no point where we hold neither; and therefore
++ * wake_futex_pi() must observe a state consistent with what we
++ * observed.
+ */
+- get_pi_state(pi_state);
++ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+ spin_unlock(&hb->lock);
+
+ ret = wake_futex_pi(uaddr, uval, pi_state);
diff --git a/patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch b/patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
new file mode 100644
index 000000000000..2128174f26cd
--- /dev/null
+++ b/patches/0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
@@ -0,0 +1,203 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Wed, 22 Mar 2017 11:36:00 +0100
+Subject: [PATCH] futex: Drop hb->lock before enqueueing on the rtmutex
+
+Upstream commit 56222b212e8edb1cf51f5dd73ff645809b082b40
+
+When PREEMPT_RT_FULL does the spinlock -> rt_mutex substitution, the PI
+chain code will (falsely) report a deadlock and BUG.
+
+The problem is that it holds hb->lock (now an rt_mutex) while doing
+task_blocks_on_rt_mutex() on the futex's pi_state::rtmutex. This, when
+interleaved just right with futex_unlock_pi(), leads it to believe it sees
+an AB-BA deadlock.
+
+  Task1 (holds rt_mutex,       Task2 (does FUTEX_LOCK_PI)
+         does FUTEX_UNLOCK_PI)
+
+                               lock hb->lock
+                               lock rt_mutex (as per start_proxy)
+  lock hb->lock
+
+Which is a trivial AB-BA.
+
+It is not an actual deadlock, because it won't be holding hb->lock by the
+time it actually blocks on the rt_mutex, but the chainwalk code doesn't
+know that and it would be a nightmare to handle this gracefully.
+
+To avoid this problem, do the same as in futex_unlock_pi() and drop
+hb->lock after acquiring wait_lock. This still fully serializes against
+futex_unlock_pi(), since adding to the wait_list does the very same lock
+dance, and removing it holds both locks.
+
+Aside from solving the RT problem, this makes the lock and unlock mechanism
+symmetric and reduces the hb->lock hold time.
+
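+The lock side then performs the same hand-over; a simplified sketch of
+the futex_lock_pi() hunk below:
+
+    raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
+    spin_unlock(q.lock_ptr);       /* drop hb->lock under wait_lock */
+    ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex,
+                                      &rt_waiter, current);
+    raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
+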
+Reported-and-tested-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Suggested-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: juri.lelli@arm.com
+Cc: xlpang@redhat.com
+Cc: rostedt@goodmis.org
+Cc: mathieu.desnoyers@efficios.com
+Cc: jdesfossez@efficios.com
+Cc: dvhart@infradead.org
+Cc: bristot@redhat.com
+Link: http://lkml.kernel.org/r/20170322104152.161341537@infradead.org
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 30 +++++++++++++++++-------
+ kernel/locking/rtmutex.c | 49 ++++++++++++++++++++++------------------
+ kernel/locking/rtmutex_common.h | 3 ++
+ 3 files changed, 52 insertions(+), 30 deletions(-)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2652,20 +2652,33 @@ static int futex_lock_pi(u32 __user *uad
+ goto no_block;
+ }
+
++ rt_mutex_init_waiter(&rt_waiter);
++
+ /*
+- * We must add ourselves to the rt_mutex waitlist while holding hb->lock
+- * such that the hb and rt_mutex wait lists match.
++ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
++ * hold it while doing rt_mutex_start_proxy(), because then it will
++ * include hb->lock in the blocking chain, even though we'll not in
++ * fact hold it while blocking. This will lead it to report -EDEADLK
++ * and BUG when futex_unlock_pi() interleaves with this.
++ *
++ * Therefore acquire wait_lock while holding hb->lock, but drop the
++ * latter before calling rt_mutex_start_proxy_lock(). This still fully
++ * serializes against futex_unlock_pi() as that does the exact same
++ * lock handoff sequence.
+ */
+- rt_mutex_init_waiter(&rt_waiter);
+- ret = rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
++ spin_unlock(q.lock_ptr);
++ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
++ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
++
+ if (ret) {
+ if (ret == 1)
+ ret = 0;
+
++ spin_lock(q.lock_ptr);
+ goto no_block;
+ }
+
+- spin_unlock(q.lock_ptr);
+
+ if (unlikely(to))
+ hrtimer_start_expires(&to->timer, HRTIMER_MODE_ABS);
+@@ -2678,6 +2691,9 @@ static int futex_lock_pi(u32 __user *uad
+ * first acquire the hb->lock before removing the lock from the
+ * rt_mutex waitqueue, such that we can keep the hb and rt_mutex
+ * wait lists consistent.
++ *
++ * In particular, it is important that futex_unlock_pi() cannot
++ * observe this inconsistency.
+ */
+ if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
+ ret = 0;
+@@ -2789,10 +2805,6 @@ static int futex_unlock_pi(u32 __user *u
+
+ get_pi_state(pi_state);
+ /*
+- * Since modifying the wait_list is done while holding both
+- * hb->lock and wait_lock, holding either is sufficient to
+- * observe it.
+- *
+ * By taking wait_lock while still holding hb->lock, we ensure
+ * there is no point where we hold neither; and therefore
+ * wake_futex_pi() must observe a state consistent with what we
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1659,31 +1659,14 @@ void rt_mutex_proxy_unlock(struct rt_mut
+ rt_mutex_set_owner(lock, NULL);
+ }
+
+-/**
+- * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+- * @lock: the rt_mutex to take
+- * @waiter: the pre-initialized rt_mutex_waiter
+- * @task: the task to prepare
+- *
+- * Returns:
+- * 0 - task blocked on lock
+- * 1 - acquired the lock for task, caller should wake it up
+- * <0 - error
+- *
+- * Special API call for FUTEX_REQUEUE_PI support.
+- */
+-int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task)
+ {
+ int ret;
+
+- raw_spin_lock_irq(&lock->wait_lock);
+-
+- if (try_to_take_rt_mutex(lock, task, NULL)) {
+- raw_spin_unlock_irq(&lock->wait_lock);
++ if (try_to_take_rt_mutex(lock, task, NULL))
+ return 1;
+- }
+
+ /* We enforce deadlock detection for futexes */
+ ret = task_blocks_on_rt_mutex(lock, waiter, task,
+@@ -1702,12 +1685,36 @@ int rt_mutex_start_proxy_lock(struct rt_
+ if (unlikely(ret))
+ remove_waiter(lock, waiter);
+
+- raw_spin_unlock_irq(&lock->wait_lock);
+-
+ debug_rt_mutex_print_deadlock(waiter);
+
+ return ret;
+ }
++
++/**
++ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
++ * @lock: the rt_mutex to take
++ * @waiter: the pre-initialized rt_mutex_waiter
++ * @task: the task to prepare
++ *
++ * Returns:
++ * 0 - task blocked on lock
++ * 1 - acquired the lock for task, caller should wake it up
++ * <0 - error
++ *
++ * Special API call for FUTEX_REQUEUE_PI support.
++ */
++int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter,
++ struct task_struct *task)
++{
++ int ret;
++
++ raw_spin_lock_irq(&lock->wait_lock);
++ ret = __rt_mutex_start_proxy_lock(lock, waiter, task);
++ raw_spin_unlock_irq(&lock->wait_lock);
++
++ return ret;
++}
+
+ /**
+ * rt_mutex_next_owner - return the next owner of the lock
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -104,6 +104,9 @@ extern void rt_mutex_init_proxy_locked(s
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+ extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
++extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
++ struct rt_mutex_waiter *waiter,
++ struct task_struct *task);
+ extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
diff --git a/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch b/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch
index 415a19d256ab..d0443c81bdaa 100644
--- a/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch
+++ b/patches/futex-Ensure-lock-unlock-symetry-versus-pi_lock-and-.patch
@@ -30,7 +30,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/futex.c
+++ b/kernel/futex.c
-@@ -904,7 +904,9 @@ void exit_pi_state_list(struct task_stru
+@@ -909,7 +909,9 @@ void exit_pi_state_list(struct task_stru
* task still owns the PI-state:
*/
if (head->next != next) {
diff --git a/patches/futex-requeue-pi-fix.patch b/patches/futex-requeue-pi-fix.patch
index 2719fd7e8926..e87a4fa978fc 100644
--- a/patches/futex-requeue-pi-fix.patch
+++ b/patches/futex-requeue-pi-fix.patch
@@ -65,9 +65,9 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
/*
-@@ -1704,6 +1705,35 @@ int rt_mutex_start_proxy_lock(struct rt_
+@@ -1696,6 +1697,35 @@ int __rt_mutex_start_proxy_lock(struct r
+ if (try_to_take_rt_mutex(lock, task, NULL))
return 1;
- }
+#ifdef CONFIG_PREEMPT_RT_FULL
+ /*
diff --git a/patches/futex-workaround-migrate_disable-enable-in-different.patch b/patches/futex-workaround-migrate_disable-enable-in-different.patch
new file mode 100644
index 000000000000..135c59df93c4
--- /dev/null
+++ b/patches/futex-workaround-migrate_disable-enable-in-different.patch
@@ -0,0 +1,58 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 8 Mar 2017 14:23:35 +0100
+Subject: [PATCH] futex: workaround migrate_disable/enable in different context
+
+migrate_disable()/migrate_enable() take a different path in atomic() vs
+!atomic() context. These little hacks ensure that the migrate-disable
+counts don't underflow or overflow while we lock the hb lock with interrupts
+enabled and unlock it with interrupts disabled.
+
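+In short: on RT each spin_unlock() of a sleeping lock implies a
+migrate_enable(), so the explicit migrate_disable()/migrate_enable()
+calls keep the count balanced. A sketch of the unlock side, as in the
+hunk below:
+
+    raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+    migrate_disable();          /* balances the enable in spin_unlock() */
+    spin_unlock(&hb->lock);
+    ret = wake_futex_pi(uaddr, uval, pi_state); /* drops wait_lock */
+    migrate_enable();           /* undoes the disable from locking hb->lock */
+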
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/futex.c | 20 ++++++++++++++++++++
+ 1 file changed, 20 insertions(+)
+
+--- a/kernel/futex.c
++++ b/kernel/futex.c
+@@ -2667,9 +2667,18 @@ static int futex_lock_pi(u32 __user *uad
+ * lock handoff sequence.
+ */
+ raw_spin_lock_irq(&q.pi_state->pi_mutex.wait_lock);
++ /*
++ * the migrate_disable() here disables migration in the in_atomic() fast
++ * path which is enabled again in the following spin_unlock(). We have
++ * one migrate_disable() pending in the slow-path which is reversed
++ * after the raw_spin_unlock_irq() where we leave the atomic context.
++ */
++ migrate_disable();
++
+ spin_unlock(q.lock_ptr);
+ ret = __rt_mutex_start_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter, current);
+ raw_spin_unlock_irq(&q.pi_state->pi_mutex.wait_lock);
++ migrate_enable();
+
+ if (ret) {
+ if (ret == 1)
+@@ -2811,10 +2820,21 @@ static int futex_unlock_pi(u32 __user *u
+ * observed.
+ */
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
++ /*
++ * Magic trickery for now to make the RT migrate disable
++ * logic happy. The following spin_unlock() happens with
++ * interrupts disabled so the internal migrate_enable()
++ * won't undo the migrate_disable() which was issued when
++ * locking hb->lock.
++ */
++ migrate_disable();
+ spin_unlock(&hb->lock);
+
++ /* Drops pi_state->pi_mutex.wait_lock */
+ ret = wake_futex_pi(uaddr, uval, pi_state);
+
++ migrate_enable();
++
+ put_pi_state(pi_state);
+
+ /*
diff --git a/patches/introduce_migrate_disable_cpu_light.patch b/patches/introduce_migrate_disable_cpu_light.patch
index a4dd649cf76e..5eda023568c6 100644
--- a/patches/introduce_migrate_disable_cpu_light.patch
+++ b/patches/introduce_migrate_disable_cpu_light.patch
@@ -138,7 +138,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
-@@ -185,6 +185,9 @@ static inline void smp_init(void) { }
+@@ -197,6 +197,9 @@ static inline int get_boot_cpu_id(void)
#define get_cpu() ({ preempt_disable(); smp_processor_id(); })
#define put_cpu() preempt_enable()
diff --git a/patches/kernel-futex-don-t-deboost-too-early.patch b/patches/kernel-futex-don-t-deboost-too-early.patch
deleted file mode 100644
index d902342dc9f1..000000000000
--- a/patches/kernel-futex-don-t-deboost-too-early.patch
+++ /dev/null
@@ -1,161 +0,0 @@
-From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-Date: Thu, 29 Sep 2016 18:49:22 +0200
-Subject: [PATCH] kernel/futex: don't deboost too early
-
-The sequence:
- T1 holds futex
- T2 blocks on futex and boosts T1
- T1 unlocks futex and holds hb->lock
- T1 unlocks rt mutex, so T1 has no more pi waiters
- T3 blocks on hb->lock and adds itself to the pi waiters list of T1
- T1 unlocks hb->lock and deboosts itself
- T4 preempts T1 so the wakeup of T2 gets delayed
-
-As a workaround I attempt here do unlock the hb->lock without a deboost
-and perform the deboost after the wake up of the waiter.
-
-Cc: stable-rt@vger.kernel.org
-Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
----
- include/linux/spinlock.h | 6 ++++
- include/linux/spinlock_rt.h | 2 +
- kernel/futex.c | 2 -
- kernel/locking/rtmutex.c | 53 ++++++++++++++++++++++++++++++++++++++------
- 4 files changed, 55 insertions(+), 8 deletions(-)
-
---- a/include/linux/spinlock.h
-+++ b/include/linux/spinlock.h
-@@ -355,6 +355,12 @@ static __always_inline void spin_unlock(
- raw_spin_unlock(&lock->rlock);
- }
-
-+static __always_inline int spin_unlock_no_deboost(spinlock_t *lock)
-+{
-+ raw_spin_unlock(&lock->rlock);
-+ return 0;
-+}
-+
- static __always_inline void spin_unlock_bh(spinlock_t *lock)
- {
- raw_spin_unlock_bh(&lock->rlock);
---- a/include/linux/spinlock_rt.h
-+++ b/include/linux/spinlock_rt.h
-@@ -26,6 +26,7 @@ extern void __lockfunc rt_spin_lock(spin
- extern unsigned long __lockfunc rt_spin_lock_trace_flags(spinlock_t *lock);
- extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass);
- extern void __lockfunc rt_spin_unlock(spinlock_t *lock);
-+extern int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock);
- extern void __lockfunc rt_spin_unlock_wait(spinlock_t *lock);
- extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
- extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock);
-@@ -111,6 +112,7 @@ static inline unsigned long spin_lock_tr
- #define spin_lock_nest_lock(lock, nest_lock) spin_lock_nested(lock, 0)
-
- #define spin_unlock(lock) rt_spin_unlock(lock)
-+#define spin_unlock_no_deboost(lock) rt_spin_unlock_no_deboost(lock)
-
- #define spin_unlock_bh(lock) \
- do { \
---- a/kernel/futex.c
-+++ b/kernel/futex.c
-@@ -1377,7 +1377,7 @@ static int wake_futex_pi(u32 __user *uad
- * deboost first (and lose our higher priority), then the task might get
- * scheduled away before the wake up can take place.
- */
-- spin_unlock(&hb->lock);
-+ deboost |= spin_unlock_no_deboost(&hb->lock);
- wake_up_q(&wake_q);
- wake_up_q_sleeper(&wake_sleeper_q);
- if (deboost)
---- a/kernel/locking/rtmutex.c
-+++ b/kernel/locking/rtmutex.c
-@@ -997,13 +997,14 @@ static inline void rt_spin_lock_fastlock
- slowfn(lock);
- }
-
--static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
-- void (*slowfn)(struct rt_mutex *lock))
-+static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
-+ int (*slowfn)(struct rt_mutex *lock))
- {
-- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
-+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
-- else
-- slowfn(lock);
-+ return 0;
-+ }
-+ return slowfn(lock);
- }
- #ifdef CONFIG_SMP
- /*
-@@ -1138,7 +1139,7 @@ static void mark_wakeup_next_waiter(stru
- /*
- * Slow path to release a rt_mutex spin_lock style
- */
--static void noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
-+static int noinline __sched rt_spin_lock_slowunlock(struct rt_mutex *lock)
- {
- unsigned long flags;
- WAKE_Q(wake_q);
-@@ -1153,7 +1154,7 @@ static void noinline __sched rt_spin_lo
- if (!rt_mutex_has_waiters(lock)) {
- lock->owner = NULL;
- raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-- return;
-+ return 0;
- }
-
- mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
-@@ -1164,6 +1165,33 @@ static void noinline __sched rt_spin_lo
-
- /* Undo pi boosting.when necessary */
- rt_mutex_adjust_prio(current);
-+ return 0;
-+}
-+
-+static int noinline __sched rt_spin_lock_slowunlock_no_deboost(struct rt_mutex *lock)
-+{
-+ unsigned long flags;
-+ WAKE_Q(wake_q);
-+ WAKE_Q(wake_sleeper_q);
-+
-+ raw_spin_lock_irqsave(&lock->wait_lock, flags);
-+
-+ debug_rt_mutex_unlock(lock);
-+
-+ rt_mutex_deadlock_account_unlock(current);
-+
-+ if (!rt_mutex_has_waiters(lock)) {
-+ lock->owner = NULL;
-+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-+ return 0;
-+ }
-+
-+ mark_wakeup_next_waiter(&wake_q, &wake_sleeper_q, lock);
-+
-+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-+ wake_up_q(&wake_q);
-+ wake_up_q_sleeper(&wake_sleeper_q);
-+ return 1;
- }
-
- void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
-@@ -1221,6 +1249,17 @@ void __lockfunc rt_spin_unlock(spinlock_
- }
- EXPORT_SYMBOL(rt_spin_unlock);
-
-+int __lockfunc rt_spin_unlock_no_deboost(spinlock_t *lock)
-+{
-+ int ret;
-+
-+ /* NOTE: we always pass in '1' for nested, for simplicity */
-+ spin_release(&lock->dep_map, 1, _RET_IP_);
-+ ret = rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock_no_deboost);
-+ migrate_enable();
-+ return ret;
-+}
-+
- void __lockfunc __rt_spin_unlock(struct rt_mutex *lock)
- {
- rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock);
diff --git a/patches/localversion.patch b/patches/localversion.patch
index 25e5fadbaae8..e1f3b8d87864 100644
--- a/patches/localversion.patch
+++ b/patches/localversion.patch
@@ -10,4 +10,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
-+-rt13
++-rt14
diff --git a/patches/lockdep-Fix-per-cpu-static-objects.patch b/patches/lockdep-Fix-per-cpu-static-objects.patch
new file mode 100644
index 000000000000..b795b1481c55
--- /dev/null
+++ b/patches/lockdep-Fix-per-cpu-static-objects.patch
@@ -0,0 +1,124 @@
+From 8ce371f9846ef1e8b3cc8f6865766cb5c1f17e40 Mon Sep 17 00:00:00 2001
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Mon, 20 Mar 2017 12:26:55 +0100
+Subject: [PATCH] lockdep: Fix per-cpu static objects
+
+Since commit 383776fa7527 ("locking/lockdep: Handle statically initialized
+PER_CPU locks properly") we try to collapse per-cpu locks into a single
+class by giving them all the same key. For this key we choose the canonical
+address of the per-cpu object, which would be the offset into the per-cpu
+area.
+
+This has two problems:
+
+ - there is a case where we run !0 lock->key through static_obj() and
+ expect this to pass; it doesn't for canonical pointers.
+
+ - 0 is a valid canonical address.
+
+Cure both issues by redefining the canonical address as the address of the
+per-cpu variable on the boot CPU.
+
+Since I didn't want to rely on CPU0 being the boot-cpu, or even existing at
+all, track the boot CPU in a variable.
+
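+The canonical address is thus computed as in the hunks below: the offset
+into the per-cpu area plus the boot CPU's copy of the variable, e.g. in
+mm/percpu.c:
+
+    if (can_addr) {
+        *can_addr = (unsigned long) (va - start);       /* offset */
+        *can_addr += (unsigned long)
+            per_cpu_ptr(base, get_boot_cpu_id());       /* boot CPU copy */
+    }
+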
+Fixes: 383776fa7527 ("locking/lockdep: Handle statically initialized PER_CPU locks properly")
+Reported-by: kernel test robot <fengguang.wu@intel.com>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Tested-by: Borislav Petkov <bp@suse.de>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: linux-mm@kvack.org
+Cc: wfg@linux.intel.com
+Cc: kernel test robot <fengguang.wu@intel.com>
+Cc: LKP <lkp@01.org>
+Link: http://lkml.kernel.org/r/20170320114108.kbvcsuepem45j5cr@hirez.programming.kicks-ass.net
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ include/linux/smp.h | 12 ++++++++++++
+ kernel/cpu.c | 6 ++++++
+ kernel/module.c | 6 +++++-
+ mm/percpu.c | 5 ++++-
+ 4 files changed, 27 insertions(+), 2 deletions(-)
+
+--- a/include/linux/smp.h
++++ b/include/linux/smp.h
+@@ -120,6 +120,13 @@ extern unsigned int setup_max_cpus;
+ extern void __init setup_nr_cpu_ids(void);
+ extern void __init smp_init(void);
+
++extern int __boot_cpu_id;
++
++static inline int get_boot_cpu_id(void)
++{
++ return __boot_cpu_id;
++}
++
+ #else /* !SMP */
+
+ static inline void smp_send_stop(void) { }
+@@ -158,6 +165,11 @@ static inline void smp_init(void) { up_l
+ static inline void smp_init(void) { }
+ #endif
+
++static inline int get_boot_cpu_id(void)
++{
++ return 0;
++}
++
+ #endif /* !SMP */
+
+ /*
+--- a/kernel/cpu.c
++++ b/kernel/cpu.c
+@@ -1240,6 +1240,8 @@ core_initcall(cpu_hotplug_pm_sync_init);
+
+ #endif /* CONFIG_PM_SLEEP_SMP */
+
++int __boot_cpu_id;
++
+ #endif /* CONFIG_SMP */
+
+ /* Boot processor state steps */
+@@ -1923,6 +1925,10 @@ void __init boot_cpu_init(void)
+ set_cpu_active(cpu, true);
+ set_cpu_present(cpu, true);
+ set_cpu_possible(cpu, true);
++
++#ifdef CONFIG_SMP
++ __boot_cpu_id = cpu;
++#endif
+ }
+
+ /*
+--- a/kernel/module.c
++++ b/kernel/module.c
+@@ -677,8 +677,12 @@ bool __is_module_percpu_address(unsigned
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + mod->percpu_size) {
+- if (can_addr)
++ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
++ *can_addr += (unsigned long)
++ per_cpu_ptr(mod->percpu,
++ get_boot_cpu_id());
++ }
+ preempt_enable();
+ return true;
+ }
+--- a/mm/percpu.c
++++ b/mm/percpu.c
+@@ -1295,8 +1295,11 @@ bool __is_kernel_percpu_address(unsigned
+ void *va = (void *)addr;
+
+ if (va >= start && va < start + static_size) {
+- if (can_addr)
++ if (can_addr) {
+ *can_addr = (unsigned long) (va - start);
++ *can_addr += (unsigned long)
++ per_cpu_ptr(base, get_boot_cpu_id());
++ }
+ return true;
+ }
+ }
diff --git a/patches/rt-add-rt-locks.patch b/patches/rt-add-rt-locks.patch
index 274cfb6cc3bb..c5cd8758c714 100644
--- a/patches/rt-add-rt-locks.patch
+++ b/patches/rt-add-rt-locks.patch
@@ -24,15 +24,15 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
include/linux/spinlock_rt.h | 162 ++++++++++++
include/linux/spinlock_types.h | 11
include/linux/spinlock_types_rt.h | 48 +++
- kernel/futex.c | 10
+ kernel/futex.c | 9
kernel/locking/Makefile | 9
kernel/locking/rt.c | 498 ++++++++++++++++++++++++++++++++++++++
- kernel/locking/rtmutex.c | 460 +++++++++++++++++++++++++++++++++--
- kernel/locking/rtmutex_common.h | 14 -
+ kernel/locking/rtmutex.c | 463 +++++++++++++++++++++++++++++++++--
+ kernel/locking/rtmutex_common.h | 6
kernel/locking/spinlock.c | 7
kernel/locking/spinlock_debug.c | 5
kernel/sched/core.c | 7
- 23 files changed, 1658 insertions(+), 56 deletions(-)
+ 23 files changed, 1653 insertions(+), 55 deletions(-)
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -915,40 +915,45 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+#endif
--- a/kernel/futex.c
+++ b/kernel/futex.c
-@@ -1301,6 +1301,7 @@ static int wake_futex_pi(u32 __user *uad
- struct futex_pi_state *pi_state = this->pi_state;
- u32 uninitialized_var(curval), newval;
+@@ -1396,6 +1396,7 @@ static int wake_futex_pi(u32 __user *uad
+ struct task_struct *new_owner;
+ bool deboost = false;
WAKE_Q(wake_q);
+ WAKE_Q(wake_sleeper_q);
- bool deboost;
int ret = 0;
-@@ -1367,7 +1368,8 @@ static int wake_futex_pi(u32 __user *uad
+ new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
+@@ -1455,13 +1456,15 @@ static int wake_futex_pi(u32 __user *uad
+ /*
+ * We've updated the uservalue, this unlock cannot fail.
+ */
+- deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
++ deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
++ &wake_sleeper_q);
+ out_unlock:
raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-- deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
-+ deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q,
-+ &wake_sleeper_q);
-
- /*
- * First unlock HB so the waiter does not spin on it once he got woken
-@@ -1377,6 +1379,7 @@ static int wake_futex_pi(u32 __user *uad
- */
- spin_unlock(&hb->lock);
- wake_up_q(&wake_q);
-+ wake_up_q_sleeper(&wake_sleeper_q);
- if (deboost)
+ if (deboost) {
+ wake_up_q(&wake_q);
++ wake_up_q_sleeper(&wake_sleeper_q);
rt_mutex_adjust_prio(current);
+ }
+
+@@ -2664,7 +2667,7 @@ static int futex_lock_pi(u32 __user *uad
+ goto no_block;
+ }
+
+- rt_mutex_init_waiter(&rt_waiter);
++ rt_mutex_init_waiter(&rt_waiter, false);
-@@ -2850,10 +2853,7 @@ static int futex_wait_requeue_pi(u32 __u
+ /*
+ * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not
+@@ -3029,7 +3032,7 @@ static int futex_wait_requeue_pi(u32 __u
* The waiter is allocated on our stack, manipulated by the requeue
* code while we sleep on uaddr.
*/
-- debug_rt_mutex_init_waiter(&rt_waiter);
-- RB_CLEAR_NODE(&rt_waiter.pi_tree_entry);
-- RB_CLEAR_NODE(&rt_waiter.tree_entry);
-- rt_waiter.task = NULL;
+- rt_mutex_init_waiter(&rt_waiter);
+ rt_mutex_init_waiter(&rt_waiter, false);
ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, VERIFY_WRITE);
@@ -1604,7 +1609,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/*
* The current top waiter stays enqueued. We
* don't have to change anything in the lock
-@@ -948,6 +982,352 @@ static int try_to_take_rt_mutex(struct r
+@@ -946,6 +980,350 @@ static int try_to_take_rt_mutex(struct r
return 1;
}
@@ -1618,7 +1623,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ might_sleep_no_state_check();
+
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
-+ rt_mutex_deadlock_account_lock(lock, current);
++ return;
+ else
+ slowfn(lock);
+}
@@ -1627,7 +1632,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ void (*slowfn)(struct rt_mutex *lock))
+{
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
-+ rt_mutex_deadlock_account_unlock(current);
++ return;
+ else
+ slowfn(lock);
+}
@@ -1774,8 +1779,6 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+
+ debug_rt_mutex_unlock(lock);
+
-+ rt_mutex_deadlock_account_unlock(current);
-+
+ if (!rt_mutex_has_waiters(lock)) {
+ lock->owner = NULL;
+ raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
@@ -1957,7 +1960,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/*
* Task blocks on lock.
*
-@@ -1060,6 +1440,7 @@ static int task_blocks_on_rt_mutex(struc
+@@ -1058,6 +1436,7 @@ static int task_blocks_on_rt_mutex(struc
* Called with lock->wait_lock held and interrupts disabled.
*/
static void mark_wakeup_next_waiter(struct wake_q_head *wake_q,
@@ -1965,7 +1968,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
struct rt_mutex *lock)
{
struct rt_mutex_waiter *waiter;
-@@ -1088,7 +1469,10 @@ static void mark_wakeup_next_waiter(stru
+@@ -1086,7 +1465,10 @@ static void mark_wakeup_next_waiter(stru
raw_spin_unlock(&current->pi_lock);
@@ -1977,7 +1980,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
/*
-@@ -1169,11 +1553,11 @@ void rt_mutex_adjust_pi(struct task_stru
+@@ -1167,21 +1549,22 @@ void rt_mutex_adjust_pi(struct task_stru
return;
}
next_lock = waiter->lock;
@@ -1990,18 +1993,28 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL,
next_lock, NULL, task);
}
-@@ -1260,9 +1644,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+
+-void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter)
++void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
+ {
+ debug_rt_mutex_init_waiter(waiter);
+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
+ RB_CLEAR_NODE(&waiter->tree_entry);
+ waiter->task = NULL;
++ waiter->savestate = savestate;
+ }
+
+ /**
+@@ -1266,7 +1649,7 @@ rt_mutex_slowlock(struct rt_mutex *lock,
unsigned long flags;
int ret = 0;
-- debug_rt_mutex_init_waiter(&waiter);
-- RB_CLEAR_NODE(&waiter.pi_tree_entry);
-- RB_CLEAR_NODE(&waiter.tree_entry);
+- rt_mutex_init_waiter(&waiter);
+ rt_mutex_init_waiter(&waiter, false);
/*
* Technically we could use raw_spin_[un]lock_irq() here, but this can
-@@ -1356,7 +1738,8 @@ static inline int rt_mutex_slowtrylock(s
+@@ -1360,7 +1743,8 @@ static inline int rt_mutex_slowtrylock(s
* Return whether the current task needs to undo a potential priority boosting.
*/
static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock,
@@ -2011,7 +2024,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
{
unsigned long flags;
-@@ -1412,7 +1795,7 @@ static bool __sched rt_mutex_slowunlock(
+@@ -1414,7 +1798,7 @@ static bool __sched rt_mutex_slowunlock(
*
* Queue the next waiter for wakeup once we release the wait_lock.
*/
@@ -2020,7 +2033,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
-@@ -1469,17 +1852,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lo
+@@ -1468,17 +1852,20 @@ rt_mutex_fasttrylock(struct rt_mutex *lo
static inline void
rt_mutex_fastunlock(struct rt_mutex *lock,
bool (*slowfn)(struct rt_mutex *lock,
@@ -2030,37 +2043,56 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
{
WAKE_Q(wake_q);
+ WAKE_Q(wake_sleeper_q);
+ bool deboost;
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
+ if (likely(rt_mutex_cmpxchg_release(lock, current, NULL)))
+ return;
- } else {
-- bool deboost = slowfn(lock, &wake_q);
-+ bool deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
+- deboost = slowfn(lock, &wake_q);
++ deboost = slowfn(lock, &wake_q, &wake_sleeper_q);
- wake_up_q(&wake_q);
-+ wake_up_q_sleeper(&wake_sleeper_q);
+ wake_up_q(&wake_q);
++ wake_up_q_sleeper(&wake_sleeper_q);
- /* Undo pi boosting if necessary: */
- if (deboost)
-@@ -1616,13 +2002,14 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
- * required or not.
+ /* Undo pi boosting if necessary: */
+ if (deboost)
+@@ -1606,7 +1993,8 @@ EXPORT_SYMBOL_GPL(rt_mutex_unlock);
+ * simple and will not need to retry.
*/
- bool __sched rt_mutex_futex_unlock(struct rt_mutex *lock,
-- struct wake_q_head *wqh)
-+ struct wake_q_head *wqh,
-+ struct wake_q_head *wq_sleeper)
+ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock,
+- struct wake_q_head *wake_q)
++ struct wake_q_head *wake_q,
++ struct wake_q_head *wq_sleeper)
{
- if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) {
- rt_mutex_deadlock_account_unlock(current);
- return false;
+ lockdep_assert_held(&lock->wait_lock);
+
+@@ -1617,21 +2005,23 @@ bool __sched __rt_mutex_futex_unlock(str
+ return false; /* done */
}
-- return rt_mutex_slowunlock(lock, wqh);
-+ return rt_mutex_slowunlock(lock, wqh, wq_sleeper);
+
+- mark_wakeup_next_waiter(wake_q, lock);
++ mark_wakeup_next_waiter(wake_q, wq_sleeper, lock);
+ return true; /* deboost and wakeups */
}
- /**
-@@ -1655,13 +2042,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
+ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock)
+ {
+ WAKE_Q(wake_q);
++ WAKE_Q(wake_sleeper_q);
+ bool deboost;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+- deboost = __rt_mutex_futex_unlock(lock, &wake_q);
++ deboost = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ if (deboost) {
+ wake_up_q(&wake_q);
++ wake_up_q_sleeper(&wake_sleeper_q);
+ rt_mutex_adjust_prio(current);
+ }
+ }
+@@ -1666,13 +2056,12 @@ EXPORT_SYMBOL_GPL(rt_mutex_destroy);
void __rt_mutex_init(struct rt_mutex *lock, const char *name)
{
lock->owner = NULL;
@@ -2075,7 +2107,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/**
* rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a
-@@ -1676,7 +2062,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
+@@ -1687,7 +2076,7 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init);
void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
struct task_struct *proxy_owner)
{
@@ -2083,10 +2115,10 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ rt_mutex_init(lock);
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
- rt_mutex_deadlock_account_lock(lock, proxy_owner);
-@@ -1838,3 +2224,25 @@ int rt_mutex_finish_proxy_lock(struct rt
+ }
+@@ -1893,3 +2282,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
- return ret;
+ return cleanup;
}
+
+#ifdef CONFIG_PREEMPT_RT_FULL
@@ -2120,31 +2152,25 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
#ifdef CONFIG_DEBUG_RT_MUTEXES
unsigned long ip;
struct pid *deadlock_task_pid;
-@@ -114,7 +115,8 @@ extern int rt_mutex_finish_proxy_lock(st
- struct rt_mutex_waiter *waiter);
- extern int rt_mutex_timed_futex_lock(struct rt_mutex *l, struct hrtimer_sleeper *to);
- extern bool rt_mutex_futex_unlock(struct rt_mutex *lock,
-- struct wake_q_head *wqh);
-+ struct wake_q_head *wqh,
-+ struct wake_q_head *wq_sleeper);
- extern void rt_mutex_adjust_prio(struct task_struct *task);
+@@ -106,7 +107,7 @@ extern void rt_mutex_init_proxy_locked(s
+ struct task_struct *proxy_owner);
+ extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
+ struct task_struct *proxy_owner);
+-extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter);
++extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate);
+ extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+ struct rt_mutex_waiter *waiter,
+ struct task_struct *task);
+@@ -123,7 +124,8 @@ extern int rt_mutex_futex_trylock(struct
+
+ extern void rt_mutex_futex_unlock(struct rt_mutex *lock);
+ extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock,
+- struct wake_q_head *wqh);
++ struct wake_q_head *wqh,
++ struct wake_q_head *wq_sleeper);
- #ifdef CONFIG_DEBUG_RT_MUTEXES
-@@ -123,4 +125,14 @@ extern void rt_mutex_adjust_prio(struct
- # include "rtmutex.h"
- #endif
+ extern void rt_mutex_adjust_prio(struct task_struct *task);
-+static inline void
-+rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate)
-+{
-+ debug_rt_mutex_init_waiter(waiter);
-+ waiter->task = NULL;
-+ waiter->savestate = savestate;
-+ RB_CLEAR_NODE(&waiter->pi_tree_entry);
-+ RB_CLEAR_NODE(&waiter->tree_entry);
-+}
-+
- #endif
--- a/kernel/locking/spinlock.c
+++ b/kernel/locking/spinlock.c
@@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(loc
diff --git a/patches/rt-locking-Reenable-migration-accross-schedule.patch b/patches/rt-locking-Reenable-migration-accross-schedule.patch
index 9b386af9fead..7ef4dfeb89c2 100644
--- a/patches/rt-locking-Reenable-migration-accross-schedule.patch
+++ b/patches/rt-locking-Reenable-migration-accross-schedule.patch
@@ -18,7 +18,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
-@@ -988,14 +988,19 @@ static int __try_to_take_rt_mutex(struct
+@@ -986,14 +986,19 @@ static int __try_to_take_rt_mutex(struct
* preemptible spin_lock functions:
*/
static inline void rt_spin_lock_fastlock(struct rt_mutex *lock,
@@ -33,14 +33,14 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ migrate_disable();
+
if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
- rt_mutex_deadlock_account_lock(lock, current);
+ return;
else
- slowfn(lock);
+ slowfn(lock, do_mig_dis);
}
- static inline int rt_spin_lock_fastunlock(struct rt_mutex *lock,
-@@ -1054,7 +1059,8 @@ static int task_blocks_on_rt_mutex(struc
+ static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock,
+@@ -1051,7 +1056,8 @@ static int task_blocks_on_rt_mutex(struc
* We store the current state under p->pi_lock in p->saved_state and
* the try_to_wake_up() code handles this accordingly.
*/
@@ -50,7 +50,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
{
struct task_struct *lock_owner, *self = current;
struct rt_mutex_waiter waiter, *top_waiter;
-@@ -1098,8 +1104,13 @@ static void noinline __sched rt_spin_lo
+@@ -1095,8 +1101,13 @@ static void noinline __sched rt_spin_lo
debug_rt_mutex_print_deadlock(&waiter);
@@ -65,7 +65,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
raw_spin_lock_irqsave(&lock->wait_lock, flags);
-@@ -1197,38 +1208,35 @@ static int noinline __sched rt_spin_lock
+@@ -1165,38 +1176,35 @@ static void noinline __sched rt_spin_lo
void __lockfunc rt_spin_lock__no_mg(spinlock_t *lock)
{
diff --git a/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch b/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch
index b05524f25aeb..bbb8795771ae 100644
--- a/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch
+++ b/patches/rtmutex--Handle-non-enqueued-waiters-gracefully.patch
@@ -21,7 +21,7 @@ Cc: stable-rt@vger.kernel.org
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
-@@ -1690,7 +1690,7 @@ int rt_mutex_start_proxy_lock(struct rt_
+@@ -1682,7 +1682,7 @@ int __rt_mutex_start_proxy_lock(struct r
ret = 0;
}
@@ -29,4 +29,4 @@ Cc: stable-rt@vger.kernel.org
+ if (ret && rt_mutex_has_waiters(lock))
remove_waiter(lock, waiter);
- raw_spin_unlock_irq(&lock->wait_lock);
+ debug_rt_mutex_print_deadlock(waiter);
diff --git a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch
index 56afc2458734..68142ad38c64 100644
--- a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch
+++ b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch
@@ -22,8 +22,8 @@ lockdep says:
Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
---
- kernel/locking/rtmutex.c | 251 ++++++++++++++++++++++++++++++++++++++++++-----
- 1 file changed, 226 insertions(+), 25 deletions(-)
+ kernel/locking/rtmutex.c | 248 ++++++++++++++++++++++++++++++++++++++++++-----
+ 1 file changed, 224 insertions(+), 24 deletions(-)
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -35,7 +35,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
#include "rtmutex_common.h"
-@@ -1360,6 +1361,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init);
+@@ -1317,6 +1318,40 @@ EXPORT_SYMBOL(__rt_spin_lock_init);
#endif /* PREEMPT_RT_FULL */
@@ -76,7 +76,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
static inline int
try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task,
struct rt_mutex_waiter *waiter)
-@@ -1614,7 +1649,8 @@ void rt_mutex_adjust_pi(struct task_stru
+@@ -1580,7 +1615,8 @@ void rt_mutex_init_waiter(struct rt_mute
static int __sched
__rt_mutex_slowlock(struct rt_mutex *lock, int state,
struct hrtimer_sleeper *timeout,
@@ -86,7 +86,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
{
int ret = 0;
-@@ -1637,6 +1673,12 @@ static int __sched
+@@ -1603,6 +1639,12 @@ static int __sched
break;
}
@@ -99,7 +99,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
raw_spin_unlock_irq(&lock->wait_lock);
debug_rt_mutex_print_deadlock(waiter);
-@@ -1671,13 +1713,90 @@ static void rt_mutex_handle_deadlock(int
+@@ -1637,13 +1679,90 @@ static void rt_mutex_handle_deadlock(int
}
}
@@ -191,7 +191,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
{
struct rt_mutex_waiter waiter;
unsigned long flags;
-@@ -1697,6 +1816,8 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+@@ -1663,6 +1782,8 @@ rt_mutex_slowlock(struct rt_mutex *lock,
/* Try to acquire the lock again: */
if (try_to_take_rt_mutex(lock, current, NULL)) {
@@ -200,7 +200,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
return 0;
}
-@@ -1711,13 +1832,23 @@ rt_mutex_slowlock(struct rt_mutex *lock,
+@@ -1677,13 +1798,23 @@ rt_mutex_slowlock(struct rt_mutex *lock,
if (likely(!ret))
/* sleep on the mutex */
@@ -226,7 +226,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
}
/*
-@@ -1850,31 +1981,36 @@ static bool __sched rt_mutex_slowunlock(
+@@ -1814,29 +1945,33 @@ static bool __sched rt_mutex_slowunlock(
*/
static inline int
rt_mutex_fastlock(struct rt_mutex *lock, int state,
@@ -237,13 +237,11 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
+ enum rtmutex_chainwalk chwalk,
+ struct ww_acquire_ctx *ww_ctx))
{
- if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
-- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
-+ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK,
-+ ww_ctx);
+
+- return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK);
++ return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx);
}
static inline int
@@ -258,16 +256,15 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
+ struct ww_acquire_ctx *ww_ctx))
{
if (chwalk == RT_MUTEX_MIN_CHAINWALK &&
- likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) {
- rt_mutex_deadlock_account_lock(lock, current);
+ likely(rt_mutex_cmpxchg_acquire(lock, NULL, current)))
return 0;
- } else
-- return slowfn(lock, state, timeout, chwalk);
-+ return slowfn(lock, state, timeout, chwalk, ww_ctx);
+
+- return slowfn(lock, state, timeout, chwalk);
++ return slowfn(lock, state, timeout, chwalk, ww_ctx);
}
static inline int
-@@ -1921,7 +2057,7 @@ void __sched rt_mutex_lock(struct rt_mut
+@@ -1881,7 +2016,7 @@ void __sched rt_mutex_lock(struct rt_mut
{
might_sleep();
@@ -276,7 +273,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
}
EXPORT_SYMBOL_GPL(rt_mutex_lock);
-@@ -1938,7 +2074,7 @@ int __sched rt_mutex_lock_interruptible(
+@@ -1898,7 +2033,7 @@ int __sched rt_mutex_lock_interruptible(
{
might_sleep();
@@ -285,16 +282,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
-@@ -1951,7 +2087,7 @@ int rt_mutex_timed_futex_lock(struct rt_
- might_sleep();
-
- return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
-- RT_MUTEX_FULL_CHAINWALK,
-+ RT_MUTEX_FULL_CHAINWALK, NULL,
- rt_mutex_slowlock);
- }
-
-@@ -1970,7 +2106,7 @@ int __sched rt_mutex_lock_killable(struc
+@@ -1925,7 +2060,7 @@ int __sched rt_mutex_lock_killable(struc
{
might_sleep();
@@ -303,7 +291,7 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
}
EXPORT_SYMBOL_GPL(rt_mutex_lock_killable);
-@@ -1994,6 +2130,7 @@ rt_mutex_timed_lock(struct rt_mutex *loc
+@@ -1949,6 +2084,7 @@ rt_mutex_timed_lock(struct rt_mutex *loc
return rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout,
RT_MUTEX_MIN_CHAINWALK,
@@ -311,17 +299,17 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
-@@ -2248,7 +2385,7 @@ int rt_mutex_finish_proxy_lock(struct rt
+@@ -2230,7 +2366,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m
set_current_state(TASK_INTERRUPTIBLE);
/* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
- if (unlikely(ret))
- remove_waiter(lock, waiter);
-@@ -2264,24 +2401,88 @@ int rt_mutex_finish_proxy_lock(struct rt
- return ret;
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+@@ -2283,24 +2419,88 @@ bool rt_mutex_cleanup_proxy_lock(struct
+ return cleanup;
}
-#ifdef CONFIG_PREEMPT_RT_FULL
diff --git a/patches/rtmutex-futex-prepare-rt.patch b/patches/rtmutex-futex-prepare-rt.patch
index 3d966f0febbe..6186521366c1 100644
--- a/patches/rtmutex-futex-prepare-rt.patch
+++ b/patches/rtmutex-futex-prepare-rt.patch
@@ -15,7 +15,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- a/kernel/futex.c
+++ b/kernel/futex.c
-@@ -1924,6 +1924,16 @@ static int futex_requeue(u32 __user *uad
+@@ -2009,6 +2009,16 @@ static int futex_requeue(u32 __user *uad
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
@@ -32,16 +32,16 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
} else if (ret) {
/*
* rt_mutex_start_proxy_lock() detected a
-@@ -2813,7 +2823,7 @@ static int futex_wait_requeue_pi(u32 __u
- {
+@@ -2992,7 +3002,7 @@ static int futex_wait_requeue_pi(u32 __u
struct hrtimer_sleeper timeout, *to = NULL;
+ struct futex_pi_state *pi_state = NULL;
struct rt_mutex_waiter rt_waiter;
- struct futex_hash_bucket *hb;
+ struct futex_hash_bucket *hb, *hb2;
union futex_key key2 = FUTEX_KEY_INIT;
struct futex_q q = futex_q_init;
int res, ret;
-@@ -2872,20 +2882,55 @@ static int futex_wait_requeue_pi(u32 __u
+@@ -3048,20 +3058,55 @@ static int futex_wait_requeue_pi(u32 __u
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
futex_wait_queue_me(hb, &q, to);
@@ -108,7 +108,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
/* Check if the requeue code acquired the second futex for us. */
if (!q.rt_waiter) {
-@@ -2894,7 +2939,8 @@ static int futex_wait_requeue_pi(u32 __u
+@@ -3070,7 +3115,8 @@ static int futex_wait_requeue_pi(u32 __u
* did a lock-steal - fix up the PI-state in that case.
*/
if (q.pi_state && (q.pi_state->owner != current)) {
@@ -116,9 +116,9 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
- if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
- rt_mutex_unlock(&q.pi_state->pi_mutex);
-@@ -2903,7 +2949,7 @@ static int futex_wait_requeue_pi(u32 __u
+ if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+ pi_state = q.pi_state;
+@@ -3081,7 +3127,7 @@ static int futex_wait_requeue_pi(u32 __u
* the requeue_pi() code acquired for us.
*/
put_pi_state(q.pi_state);
@@ -127,16 +127,16 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
} else {
struct rt_mutex *pi_mutex;
-@@ -2918,7 +2964,8 @@ static int futex_wait_requeue_pi(u32 __u
- ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter);
- debug_rt_mutex_free_waiter(&rt_waiter);
+@@ -3095,7 +3141,8 @@ static int futex_wait_requeue_pi(u32 __u
+ pi_mutex = &q.pi_state->pi_mutex;
+ ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter);
- spin_lock(q.lock_ptr);
+ spin_lock(&hb2->lock);
+ BUG_ON(&hb2->lock != q.lock_ptr);
- /*
- * Fixup the pi_state owner and possibly acquire the lock if we
- * haven't already.
+ if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter))
+ ret = 0;
+
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -133,6 +133,11 @@ static void fixup_rt_mutex_waiters(struc
@@ -170,7 +170,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
goto out_unlock_pi;
/*
-@@ -971,6 +977,23 @@ static int task_blocks_on_rt_mutex(struc
+@@ -969,6 +975,23 @@ static int task_blocks_on_rt_mutex(struc
return -EDEADLK;
raw_spin_lock(&task->pi_lock);
@@ -194,7 +194,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
__rt_mutex_adjust_prio(task);
waiter->task = task;
waiter->lock = lock;
-@@ -994,7 +1017,7 @@ static int task_blocks_on_rt_mutex(struc
+@@ -992,7 +1015,7 @@ static int task_blocks_on_rt_mutex(struc
rt_mutex_enqueue_pi(owner, waiter);
__rt_mutex_adjust_prio(owner);
@@ -203,7 +203,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
chain_walk = 1;
} else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) {
chain_walk = 1;
-@@ -1078,7 +1101,7 @@ static void remove_waiter(struct rt_mute
+@@ -1076,7 +1099,7 @@ static void remove_waiter(struct rt_mute
{
bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock));
struct task_struct *owner = rt_mutex_owner(lock);
@@ -212,7 +212,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
raw_spin_lock(&current->pi_lock);
rt_mutex_dequeue(lock, waiter);
-@@ -1102,7 +1125,8 @@ static void remove_waiter(struct rt_mute
+@@ -1100,7 +1123,8 @@ static void remove_waiter(struct rt_mute
__rt_mutex_adjust_prio(owner);
/* Store the lock on which owner is blocked or NULL */
@@ -222,7 +222,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
raw_spin_unlock(&owner->pi_lock);
-@@ -1138,7 +1162,7 @@ void rt_mutex_adjust_pi(struct task_stru
+@@ -1136,7 +1160,7 @@ void rt_mutex_adjust_pi(struct task_stru
raw_spin_lock_irqsave(&task->pi_lock, flags);
waiter = task->pi_blocked_on;
diff --git a/patches/rtmutex-lock-killable.patch b/patches/rtmutex-lock-killable.patch
index ac46c07b973b..1d8a14060569 100644
--- a/patches/rtmutex-lock-killable.patch
+++ b/patches/rtmutex-lock-killable.patch
@@ -23,7 +23,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
-@@ -1531,6 +1531,25 @@ int rt_mutex_timed_futex_lock(struct rt_
+@@ -1524,6 +1524,25 @@ int __sched rt_mutex_futex_trylock(struc
}
/**
diff --git a/patches/rtmutex-trylock-is-okay-on-RT.patch b/patches/rtmutex-trylock-is-okay-on-RT.patch
index 1aa6206dbc90..e87897594e44 100644
--- a/patches/rtmutex-trylock-is-okay-on-RT.patch
+++ b/patches/rtmutex-trylock-is-okay-on-RT.patch
@@ -13,7 +13,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
-@@ -1542,7 +1542,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
+@@ -1535,7 +1535,11 @@ EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
*/
int __sched rt_mutex_trylock(struct rt_mutex *lock)
{
diff --git a/patches/series b/patches/series
index 3766a2fbe2dc..137fcb9ca0f8 100644
--- a/patches/series
+++ b/patches/series
@@ -17,6 +17,21 @@ timer-make-the-base-lock-raw.patch
############################################################
lockdep-Handle-statically-initialized-PER_CPU-locks-.patch
lockdep-Fix-compilation-error-for-CONFIG_MODULES-and.patch
+lockdep-Fix-per-cpu-static-objects.patch
+
+0001-futex-Cleanup-variable-names-for-futex_top_waiter.patch
+0002-futex-Use-smp_store_release-in-mark_wake_futex.patch
+0003-futex-Remove-rt_mutex_deadlock_account_.patch
+0004-futex-rt_mutex-Provide-futex-specific-rt_mutex-API.patch
+0005-futex-Change-locking-rules.patch
+0006-futex-Cleanup-refcounting.patch
+0007-futex-Rework-inconsistent-rt_mutex-futex_q-state.patch
+0008-futex-Pull-rt_mutex_futex_unlock-out-from-under-hb-l.patch
+0009-futex-rt_mutex-Introduce-rt_mutex_init_waiter.patch
+0010-futex-rt_mutex-Restructure-rt_mutex_finish_proxy_loc.patch
+0011-futex-Rework-futex_lock_pi-to-use-rt_mutex_-_proxy_l.patch
+0012-futex-Futex_unlock_pi-determinism.patch
+0013-futex-Drop-hb-lock-before-enqueueing-on-the-rtmutex.patch
# Those two should vanish soon (not use PIT during bootup)
at91_dont_enable_disable_clock.patch
@@ -183,6 +198,7 @@ preempt-nort-rt-variants.patch
# local locks & migrate disable
introduce_migrate_disable_cpu_light.patch
+futex-workaround-migrate_disable-enable-in-different.patch
rt-local-irq-lock.patch
locallock-add-local_lock_on.patch
@@ -333,7 +349,6 @@ rtmutex-avoid-include-hell.patch
rtmutex_dont_include_rcu.patch
rt-add-rt-locks.patch
rt-drop_mutex_disable_on_not_debug.patch
-kernel-futex-don-t-deboost-too-early.patch
rtmutex-add-a-first-shot-of-ww_mutex.patch
ptrace-fix-ptrace-vs-tasklist_lock-race.patch