author     Sebastian Andrzej Siewior <bigeasy@linutronix.de>  2017-05-27 10:04:35 +0200
committer  Sebastian Andrzej Siewior <bigeasy@linutronix.de>  2017-05-27 10:04:35 +0200
commit     92747649c94d4d6b1cf3ee8b24caafe8c8c019e3 (patch)
tree       5cf411b8809ad338ff660835611041b21f92a509
parent     67c6ffc95dc19132a0b3e4b672b66f604bc7c5bf (diff)
download   linux-rt-4.9.30-rt20-patches.tar.gz
[ANNOUNCE] v4.9.30-rt20  (tag: v4.9.30-rt20-patches)
Dear RT folks!

I'm pleased to announce the v4.9.30-rt20 patch set.

Changes since v4.9.30-rt19:

  - The patch "timers: Don't wake ktimersoftd on every tick" has been
    reverted because this optimisation can lead to timers not getting
    expired. Reported by Klaus Gusenleitner, debugged by Anna-Maria
    Gleixner.

  - Markus Trippelsdorf reported that the new futex code makes the
    glibc/nptl/tst-robustpi8 test fail. Patch by Peter Zijlstra.

Known issues
  - CPU hotplug got a little better but can deadlock.

  - gdb. While gdb is following a task it is possible that after a
    fork() operation the task is waiting for gdb and gdb waiting for the
    task.

The delta patch against v4.9.30-rt19 is appended below and can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/incr/patch-4.9.30-rt19-rt20.patch.xz

You can get this release via the git tree at:

    git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.9.30-rt20

The RT patch against v4.9.30 can be found here:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patch-4.9.30-rt20.patch.xz

The split quilt queue is available at:

    https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patches-4.9.30-rt20.tar.xz

Sebastian

diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -2400,11 +2400,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
 	int ret;
 
 	raw_spin_lock_irq(&lock->wait_lock);
-
-	set_current_state(TASK_INTERRUPTIBLE);
-
 	/* sleep on the mutex */
+	set_current_state(TASK_INTERRUPTIBLE);
 	ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
+	/*
+	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+	 * have to fix that up.
+	 */
+	fixup_rt_mutex_waiters(lock);
 
 	/*
 	 * RT has a problem here when the wait got interrupted by a timeout
@@ -2423,7 +2426,6 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
 		tsk->pi_blocked_on = NULL;
 		raw_spin_unlock(&tsk->pi_lock);
 	}
-
 	raw_spin_unlock_irq(&lock->wait_lock);
 
 	return ret;
@@ -2455,15 +2457,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
 
 	raw_spin_lock_irq(&lock->wait_lock);
 	/*
+	 * Do an unconditional try-lock, this deals with the lock stealing
+	 * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+	 * sets a NULL owner.
+	 *
+	 * We're not interested in the return value, because the subsequent
+	 * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+	 * we will own the lock and it will have removed the waiter. If we
+	 * failed the trylock, we're still not owner and we need to remove
+	 * ourselves.
+	 */
+	try_to_take_rt_mutex(lock, current, waiter);
+	/*
 	 * Unless we're the owner; we're still enqueued on the wait_list.
 	 * So check if we became owner, if not, take us off the wait_list.
 	 */
 	if (rt_mutex_owner(lock) != current) {
 		remove_waiter(lock, waiter);
-		fixup_rt_mutex_waiters(lock);
 		cleanup = true;
 	}
-
 	/*
 	 * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
 	 * have to fix that up.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -206,8 +206,6 @@ struct timer_base {
 	bool is_idle;
 	DECLARE_BITMAP(pending_map, WHEEL_SIZE);
 	struct hlist_head vectors[WHEEL_SIZE];
-	struct hlist_head expired_lists[LVL_DEPTH];
-	int expired_count;
 } ____cacheline_aligned;
 
 static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
@@ -1355,8 +1353,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
 	}
 }
 
-static inline void __expire_timers(struct timer_base *base,
-				   struct hlist_head *head)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
 {
 	while (!hlist_empty(head)) {
 		struct timer_list *timer;
@@ -1387,38 +1384,21 @@ static inline void __expire_timers(struct timer_base *base,
 	}
 }
 
-static void expire_timers(struct timer_base *base)
-{
-	struct hlist_head *head;
-
-	while (base->expired_count--) {
-		head = base->expired_lists + base->expired_count;
-		__expire_timers(base, head);
-	}
-	base->expired_count = 0;
-}
-
-static void __collect_expired_timers(struct timer_base *base)
+static int __collect_expired_timers(struct timer_base *base,
+				    struct hlist_head *heads)
 {
 	unsigned long clk = base->clk;
 	struct hlist_head *vec;
-	int i;
+	int i, levels = 0;
 	unsigned int idx;
 
-	/*
-	 * expire_timers() must be called at least once before we can
-	 * collect more timers
-	 */
-	if (WARN_ON(base->expired_count))
-		return;
-
 	for (i = 0; i < LVL_DEPTH; i++) {
 		idx = (clk & LVL_MASK) + i * LVL_SIZE;
 
 		if (__test_and_clear_bit(idx, base->pending_map)) {
 			vec = base->vectors + idx;
-			hlist_move_list(vec,
-				&base->expired_lists[base->expired_count++]);
+			hlist_move_list(vec, heads++);
+			levels++;
 		}
 		/* Is it time to look at the next level? */
 		if (clk & LVL_CLK_MASK)
@@ -1426,6 +1406,7 @@ static void __collect_expired_timers(struct timer_base *base)
 		/* Shift clock for the next level granularity */
 		clk >>= LVL_CLK_SHIFT;
 	}
+	return levels;
 }
 
 #ifdef CONFIG_NO_HZ_COMMON
@@ -1618,7 +1599,8 @@ void timer_clear_idle(void)
 	base->is_idle = false;
 }
 
-static void collect_expired_timers(struct timer_base *base)
+static int collect_expired_timers(struct timer_base *base,
+				  struct hlist_head *heads)
 {
 	/*
 	 * NOHZ optimization. After a long idle sleep we need to forward the
@@ -1635,49 +1617,20 @@ static void collect_expired_timers(struct timer_base *base)
 		if (time_after(next, jiffies)) {
 			/* The call site will increment clock! */
 			base->clk = jiffies - 1;
-			return;
+			return 0;
 		}
 		base->clk = next;
 	}
-	__collect_expired_timers(base);
+	return __collect_expired_timers(base, heads);
 }
 #else
-static inline void collect_expired_timers(struct timer_base *base)
+static inline int collect_expired_timers(struct timer_base *base,
+					 struct hlist_head *heads)
 {
-	__collect_expired_timers(base);
+	return __collect_expired_timers(base, heads);
 }
 #endif
 
-static int find_expired_timers(struct timer_base *base)
-{
-	const unsigned long int end_clk = jiffies;
-
-	while (!base->expired_count && time_after_eq(end_clk, base->clk)) {
-		collect_expired_timers(base);
-		base->clk++;
-	}
-
-	return base->expired_count;
-}
-
-/* Called from CPU tick routine to quickly collect expired timers */
-static int tick_find_expired(struct timer_base *base)
-{
-	int count;
-
-	raw_spin_lock(&base->lock);
-
-	if (unlikely(time_after(jiffies, base->clk + HZ))) {
-		/* defer to ktimersoftd; don't spend too long in irq context */
-		count = -1;
-	} else
-		count = find_expired_timers(base);
-
-	raw_spin_unlock(&base->lock);
-
-	return count;
-}
-
 /*
  * Called from the timer interrupt handler to charge one tick to the current
  * process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1704,11 +1657,22 @@ void update_process_times(int user_tick)
  */
 static inline void __run_timers(struct timer_base *base)
 {
+	struct hlist_head heads[LVL_DEPTH];
+	int levels;
+
+	if (!time_after_eq(jiffies, base->clk))
+		return;
+
 	raw_spin_lock_irq(&base->lock);
 
-	while (find_expired_timers(base))
-		expire_timers(base);
+	while (time_after_eq(jiffies, base->clk)) {
+		levels = collect_expired_timers(base, heads);
+		base->clk++;
+
+		while (levels--)
+			expire_timers(base, heads + levels);
+	}
 
 	raw_spin_unlock_irq(&base->lock);
 	wakeup_timer_waiters(base);
 }
@@ -1736,12 +1700,12 @@ void run_local_timers(void)
 
 	hrtimer_run_queues();
 	/* Raise the softirq only if required. */
-	if (time_before(jiffies, base->clk) || !tick_find_expired(base)) {
+	if (time_before(jiffies, base->clk)) {
 		if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
 			return;
 		/* CPU is awake, so check the deferrable base. */
 		base++;
-		if (time_before(jiffies, base->clk) || !tick_find_expired(base))
+		if (time_before(jiffies, base->clk))
 			return;
 	}
 	raise_softirq(TIMER_SOFTIRQ);
@@ -1911,7 +1875,6 @@ int timers_dead_cpu(unsigned int cpu)
 	raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
 
 	BUG_ON(old_base->running_timer);
-	BUG_ON(old_base->expired_count);
 
 	for (i = 0; i < WHEEL_SIZE; i++)
 		migrate_timer_list(new_base, old_base->vectors + i);
@@ -1938,7 +1901,6 @@ static void __init init_timer_cpu(int cpu)
 #ifdef CONFIG_PREEMPT_RT_FULL
 		init_swait_queue_head(&base->wait_for_running_timer);
 #endif
-		base->expired_count = 0;
 	}
 }
 
diff --git a/localversion-rt b/localversion-rt
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt19
+-rt20
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-rw-r--r--  patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch              |   4
-rw-r--r--  patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch     | 217
-rw-r--r--  patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch         | 125
-rw-r--r--  patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch            |  10
-rw-r--r--  patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch   |   4
-rw-r--r--  patches/irqwork-Move-irq-safe-work-to-irq-context.patch              |   4
-rw-r--r--  patches/irqwork-push_most_work_into_softirq_context.patch            |   4
-rw-r--r--  patches/localversion.patch                                           |   2
-rw-r--r--  patches/md-raid5-percpu-handling-rt-aware.patch                      |   4
-rw-r--r--  patches/mips-disable-highmem-on-rt.patch                             |   2
-rw-r--r--  patches/mm-convert-swap-to-percpu-locked.patch                       |   2
-rw-r--r--  patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch                |   6
-rw-r--r--  patches/rt-add-rt-locks.patch                                        |   2
-rw-r--r--  patches/rtmutex-add-a-first-shot-of-ww_mutex.patch                   |  14
-rw-r--r--  patches/series                                                       |   2
-rw-r--r--  patches/x86-kvm-require-const-tsc-for-rt.patch                       |   2
16 files changed, 374 insertions, 30 deletions
diff --git a/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch b/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch
index 47c94bdc0da4..28b3dc4c6424 100644
--- a/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch
+++ b/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch
@@ -57,7 +57,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
-@@ -2695,7 +2695,7 @@ static int _nfs4_open_and_get_state(stru
+@@ -2697,7 +2697,7 @@ static int _nfs4_open_and_get_state(stru
unsigned int seq;
int ret;
@@ -66,7 +66,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
ret = _nfs4_proc_open(opendata);
if (ret != 0)
-@@ -2733,7 +2733,7 @@ static int _nfs4_open_and_get_state(stru
+@@ -2735,7 +2735,7 @@ static int _nfs4_open_and_get_state(stru
if (d_inode(dentry) == state->inode) {
nfs_inode_attach_open_context(ctx);
diff --git a/patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch b/patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch
new file mode 100644
index 000000000000..3fa509c360c5
--- /dev/null
+++ b/patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch
@@ -0,0 +1,217 @@
+From 16145f9c01a2e671aceb731050de9fbf977d31d0 Mon Sep 17 00:00:00 2001
+From: Anna-Maria Gleixner <anna-maria@linutronix.de>
+Date: Fri, 26 May 2017 19:16:07 +0200
+Subject: [PATCH] Revert "timers: Don't wake ktimersoftd on every tick"
+
+This reverts commit 032f93cae150a ("timers: Don't wake ktimersoftd on
+every tick").
+
+The problem is that the look ahead optimization from the tick timer
+interrupt context can race with the softirq thread expiring timer. As
+a consequence the temporary hlist heads which hold the to expire
+timers are overwritten and the timers which are already removed from
+the wheel bucket for expiry are now dangling w/o a list head.
+
+That means those timers never get expired. If one of those timers is
+canceled the removal operation will result in a hlist corruption.
+
+Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/time/timer.c | 96 +++++++++++++++-------------------------------------
+ 1 file changed, 29 insertions(+), 67 deletions(-)
+
+--- a/kernel/time/timer.c
++++ b/kernel/time/timer.c
+@@ -206,8 +206,6 @@ struct timer_base {
+ bool is_idle;
+ DECLARE_BITMAP(pending_map, WHEEL_SIZE);
+ struct hlist_head vectors[WHEEL_SIZE];
+- struct hlist_head expired_lists[LVL_DEPTH];
+- int expired_count;
+ } ____cacheline_aligned;
+
+ static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
+@@ -1355,8 +1353,7 @@ static void call_timer_fn(struct timer_l
+ }
+ }
+
+-static inline void __expire_timers(struct timer_base *base,
+- struct hlist_head *head)
++static void expire_timers(struct timer_base *base, struct hlist_head *head)
+ {
+ while (!hlist_empty(head)) {
+ struct timer_list *timer;
+@@ -1387,38 +1384,21 @@ static inline void __expire_timers(struc
+ }
+ }
+
+-static void expire_timers(struct timer_base *base)
+-{
+- struct hlist_head *head;
+-
+- while (base->expired_count--) {
+- head = base->expired_lists + base->expired_count;
+- __expire_timers(base, head);
+- }
+- base->expired_count = 0;
+-}
+-
+-static void __collect_expired_timers(struct timer_base *base)
++static int __collect_expired_timers(struct timer_base *base,
++ struct hlist_head *heads)
+ {
+ unsigned long clk = base->clk;
+ struct hlist_head *vec;
+- int i;
++ int i, levels = 0;
+ unsigned int idx;
+
+- /*
+- * expire_timers() must be called at least once before we can
+- * collect more timers
+- */
+- if (WARN_ON(base->expired_count))
+- return;
+-
+ for (i = 0; i < LVL_DEPTH; i++) {
+ idx = (clk & LVL_MASK) + i * LVL_SIZE;
+
+ if (__test_and_clear_bit(idx, base->pending_map)) {
+ vec = base->vectors + idx;
+- hlist_move_list(vec,
+- &base->expired_lists[base->expired_count++]);
++ hlist_move_list(vec, heads++);
++ levels++;
+ }
+ /* Is it time to look at the next level? */
+ if (clk & LVL_CLK_MASK)
+@@ -1426,6 +1406,7 @@ static void __collect_expired_timers(str
+ /* Shift clock for the next level granularity */
+ clk >>= LVL_CLK_SHIFT;
+ }
++ return levels;
+ }
+
+ #ifdef CONFIG_NO_HZ_COMMON
+@@ -1618,7 +1599,8 @@ void timer_clear_idle(void)
+ base->is_idle = false;
+ }
+
+-static void collect_expired_timers(struct timer_base *base)
++static int collect_expired_timers(struct timer_base *base,
++ struct hlist_head *heads)
+ {
+ /*
+ * NOHZ optimization. After a long idle sleep we need to forward the
+@@ -1635,49 +1617,20 @@ static void collect_expired_timers(struc
+ if (time_after(next, jiffies)) {
+ /* The call site will increment clock! */
+ base->clk = jiffies - 1;
+- return;
++ return 0;
+ }
+ base->clk = next;
+ }
+- __collect_expired_timers(base);
++ return __collect_expired_timers(base, heads);
+ }
+ #else
+-static inline void collect_expired_timers(struct timer_base *base)
++static inline int collect_expired_timers(struct timer_base *base,
++ struct hlist_head *heads)
+ {
+- __collect_expired_timers(base);
++ return __collect_expired_timers(base, heads);
+ }
+ #endif
+
+-static int find_expired_timers(struct timer_base *base)
+-{
+- const unsigned long int end_clk = jiffies;
+-
+- while (!base->expired_count && time_after_eq(end_clk, base->clk)) {
+- collect_expired_timers(base);
+- base->clk++;
+- }
+-
+- return base->expired_count;
+-}
+-
+-/* Called from CPU tick routine to quickly collect expired timers */
+-static int tick_find_expired(struct timer_base *base)
+-{
+- int count;
+-
+- raw_spin_lock(&base->lock);
+-
+- if (unlikely(time_after(jiffies, base->clk + HZ))) {
+- /* defer to ktimersoftd; don't spend too long in irq context */
+- count = -1;
+- } else
+- count = find_expired_timers(base);
+-
+- raw_spin_unlock(&base->lock);
+-
+- return count;
+-}
+-
+ /*
+ * Called from the timer interrupt handler to charge one tick to the current
+ * process. user_tick is 1 if the tick is user time, 0 for system.
+@@ -1704,11 +1657,22 @@ void update_process_times(int user_tick)
+ */
+ static inline void __run_timers(struct timer_base *base)
+ {
++ struct hlist_head heads[LVL_DEPTH];
++ int levels;
++
++ if (!time_after_eq(jiffies, base->clk))
++ return;
++
+ raw_spin_lock_irq(&base->lock);
+
+- while (find_expired_timers(base))
+- expire_timers(base);
++ while (time_after_eq(jiffies, base->clk)) {
++
++ levels = collect_expired_timers(base, heads);
++ base->clk++;
+
++ while (levels--)
++ expire_timers(base, heads + levels);
++ }
+ raw_spin_unlock_irq(&base->lock);
+ wakeup_timer_waiters(base);
+ }
+@@ -1734,12 +1698,12 @@ void run_local_timers(void)
+
+ hrtimer_run_queues();
+ /* Raise the softirq only if required. */
+- if (time_before(jiffies, base->clk) || !tick_find_expired(base)) {
++ if (time_before(jiffies, base->clk)) {
+ if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
+ return;
+ /* CPU is awake, so check the deferrable base. */
+ base++;
+- if (time_before(jiffies, base->clk) || !tick_find_expired(base))
++ if (time_before(jiffies, base->clk))
+ return;
+ }
+ raise_softirq(TIMER_SOFTIRQ);
+@@ -1909,7 +1873,6 @@ int timers_dead_cpu(unsigned int cpu)
+ raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
+
+ BUG_ON(old_base->running_timer);
+- BUG_ON(old_base->expired_count);
+
+ for (i = 0; i < WHEEL_SIZE; i++)
+ migrate_timer_list(new_base, old_base->vectors + i);
+@@ -1936,7 +1899,6 @@ static void __init init_timer_cpu(int cp
+ #ifdef CONFIG_PREEMPT_RT_FULL
+ init_swait_queue_head(&base->wait_for_running_timer);
+ #endif
+- base->expired_count = 0;
+ }
+ }
+
diff --git a/patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch b/patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch
new file mode 100644
index 000000000000..21b716ce5196
--- /dev/null
+++ b/patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch
@@ -0,0 +1,125 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Mon, 22 May 2017 13:04:50 -0700
+Subject: [PATCH] futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock()
+
+Markus reported that the glibc/nptl/tst-robustpi8 test was failing after
+commit:
+
+ cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()")
+
+The following trace shows the problem:
+
+ ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI
+ ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0
+ ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI
+ ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0
+ ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000
+ ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT
+
+Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161
+which then returns with -ETIMEDOUT. That wrecks the lock state, because now
+the owner isn't aware it acquired the lock and removes the pending robust
+list entry.
+
+If 2161 is killed, the robust list will not clear out this futex and the
+subsequent acquire on this futex will then (correctly) result in -ESRCH
+which is unexpected by glibc, triggers an internal assertion and dies.
+
+Task 2161 Task 2165
+
+rt_mutex_wait_proxy_lock()
+ timeout();
+ /* T2161 is still queued in the waiter list */
+ return -ETIMEDOUT;
+
+ futex_unlock_pi()
+ spin_lock(hb->lock);
+ rtmutex_unlock()
+ remove_rtmutex_waiter(T2161);
+ mark_lock_available();
+ /* Make the next waiter owner of the user space side */
+ futex_uval = 2161;
+ spin_unlock(hb->lock);
+spin_lock(hb->lock);
+rt_mutex_cleanup_proxy_lock()
+ if (rtmutex_owner() !== current)
+ ...
+ return FAIL;
+....
+return -ETIMEOUT;
+
+This means that rt_mutex_cleanup_proxy_lock() needs to call
+try_to_take_rt_mutex() so it can take over the rtmutex correctly which was
+assigned by the waker. If the rtmutex is owned by some other task then this
+call is harmless and just confirmes that the waiter is not able to acquire
+it.
+
+While there, fix what looks like a merge error which resulted in
+rt_mutex_cleanup_proxy_lock() having two calls to
+fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any.
+Both should have one, since both potentially touch the waiter list.
+
+Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()")
+Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de>
+Bug-Spotted-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: Florian Weimer <fweimer@redhat.com>
+Cc: Darren Hart <dvhart@infradead.org>
+Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
+Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/locking/rtmutex.c | 24 ++++++++++++++++++------
+ 1 file changed, 18 insertions(+), 6 deletions(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1775,12 +1775,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ int ret;
+
+ raw_spin_lock_irq(&lock->wait_lock);
+-
+- set_current_state(TASK_INTERRUPTIBLE);
+-
+ /* sleep on the mutex */
++ set_current_state(TASK_INTERRUPTIBLE);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+-
++ /*
++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
++ * have to fix that up.
++ */
++ fixup_rt_mutex_waiters(lock);
+ raw_spin_unlock_irq(&lock->wait_lock);
+
+ return ret;
+@@ -1812,15 +1814,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
+
+ raw_spin_lock_irq(&lock->wait_lock);
+ /*
++ * Do an unconditional try-lock, this deals with the lock stealing
++ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
++ * sets a NULL owner.
++ *
++ * We're not interested in the return value, because the subsequent
++ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
++ * we will own the lock and it will have removed the waiter. If we
++ * failed the trylock, we're still not owner and we need to remove
++ * ourselves.
++ */
++ try_to_take_rt_mutex(lock, current, waiter);
++ /*
+ * Unless we're the owner; we're still enqueued on the wait_list.
+ * So check if we became owner, if not, take us off the wait_list.
+ */
+ if (rt_mutex_owner(lock) != current) {
+ remove_waiter(lock, waiter);
+- fixup_rt_mutex_waiters(lock);
+ cleanup = true;
+ }
+-
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
diff --git a/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch b/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch
index 5752fed09b9e..66da85792465 100644
--- a/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch
+++ b/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch
@@ -34,10 +34,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
int ret;
raw_spin_lock_irq(&lock->wait_lock);
-@@ -2397,6 +2398,24 @@ int rt_mutex_wait_proxy_lock(struct rt_m
- /* sleep on the mutex */
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
-
+@@ -2399,6 +2400,24 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
++
+ /*
+ * RT has a problem here when the wait got interrupted by a timeout
+ * or a signal. task->pi_blocked_on is still set. The task must
@@ -55,7 +56,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+ tsk->pi_blocked_on = NULL;
+ raw_spin_unlock(&tsk->pi_lock);
+ }
-+
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
diff --git a/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch b/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch
index 753d477ac205..84f2aac616bd 100644
--- a/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch
+++ b/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch
@@ -35,7 +35,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/* bitmap for indexing intel_iommus */
static int g_num_of_iommus;
-@@ -3716,10 +3716,8 @@ static void add_unmap(struct dmar_domain
+@@ -3719,10 +3719,8 @@ static void add_unmap(struct dmar_domain
struct intel_iommu *iommu;
struct deferred_flush_entry *entry;
struct deferred_flush_data *flush_data;
@@ -47,7 +47,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/* Flush all CPUs' entries to avoid deferring too much. If
* this becomes a bottleneck, can just flush us, and rely on
-@@ -3752,8 +3750,6 @@ static void add_unmap(struct dmar_domain
+@@ -3755,8 +3753,6 @@ static void add_unmap(struct dmar_domain
}
flush_data->size++;
spin_unlock_irqrestore(&flush_data->lock, flags);
diff --git a/patches/irqwork-Move-irq-safe-work-to-irq-context.patch b/patches/irqwork-Move-irq-safe-work-to-irq-context.patch
index c79e13418be4..33b7c138ec91 100644
--- a/patches/irqwork-Move-irq-safe-work-to-irq-context.patch
+++ b/patches/irqwork-Move-irq-safe-work-to-irq-context.patch
@@ -55,7 +55,7 @@ Cc: stable-rt@vger.kernel.org
* Synchronize against the irq_work @entry, ensures the entry is not
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
-@@ -1691,7 +1691,7 @@ void update_process_times(int user_tick)
+@@ -1644,7 +1644,7 @@ void update_process_times(int user_tick)
scheduler_tick();
run_local_timers();
rcu_check_callbacks(user_tick);
@@ -64,7 +64,7 @@ Cc: stable-rt@vger.kernel.org
if (in_irq())
irq_work_tick();
#endif
-@@ -1720,9 +1720,7 @@ static __latent_entropy void run_timer_s
+@@ -1684,9 +1684,7 @@ static __latent_entropy void run_timer_s
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
diff --git a/patches/irqwork-push_most_work_into_softirq_context.patch b/patches/irqwork-push_most_work_into_softirq_context.patch
index 424ee22771bc..7af377095b54 100644
--- a/patches/irqwork-push_most_work_into_softirq_context.patch
+++ b/patches/irqwork-push_most_work_into_softirq_context.patch
@@ -163,7 +163,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
/*
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
-@@ -1691,7 +1691,7 @@ void update_process_times(int user_tick)
+@@ -1644,7 +1644,7 @@ void update_process_times(int user_tick)
scheduler_tick();
run_local_timers();
rcu_check_callbacks(user_tick);
@@ -172,7 +172,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
if (in_irq())
irq_work_tick();
#endif
-@@ -1720,6 +1720,10 @@ static __latent_entropy void run_timer_s
+@@ -1684,6 +1684,10 @@ static __latent_entropy void run_timer_s
{
struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]);
diff --git a/patches/localversion.patch b/patches/localversion.patch
index 19d7ea05016c..d7c1a50b87ee 100644
--- a/patches/localversion.patch
+++ b/patches/localversion.patch
@@ -10,4 +10,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- /dev/null
+++ b/localversion-rt
@@ -0,0 +1 @@
-+-rt19
++-rt20
diff --git a/patches/md-raid5-percpu-handling-rt-aware.patch b/patches/md-raid5-percpu-handling-rt-aware.patch
index 2593aa1b7012..16e023890d14 100644
--- a/patches/md-raid5-percpu-handling-rt-aware.patch
+++ b/patches/md-raid5-percpu-handling-rt-aware.patch
@@ -41,7 +41,7 @@ Tested-by: Udo van den Heuvel <udovdh@xs4all.nl>
}
static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp,
-@@ -6391,6 +6393,7 @@ static int raid456_cpu_up_prepare(unsign
+@@ -6393,6 +6395,7 @@ static int raid456_cpu_up_prepare(unsign
__func__, cpu);
return -ENOMEM;
}
@@ -49,7 +49,7 @@ Tested-by: Udo van den Heuvel <udovdh@xs4all.nl>
return 0;
}
-@@ -6401,7 +6404,6 @@ static int raid5_alloc_percpu(struct r5c
+@@ -6403,7 +6406,6 @@ static int raid5_alloc_percpu(struct r5c
conf->percpu = alloc_percpu(struct raid5_percpu);
if (!conf->percpu)
return -ENOMEM;
diff --git a/patches/mips-disable-highmem-on-rt.patch b/patches/mips-disable-highmem-on-rt.patch
index 3a4f8ac2ac56..0ff1fe5302bc 100644
--- a/patches/mips-disable-highmem-on-rt.patch
+++ b/patches/mips-disable-highmem-on-rt.patch
@@ -11,7 +11,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
-@@ -2515,7 +2515,7 @@ config MIPS_ASID_BITS_VARIABLE
+@@ -2516,7 +2516,7 @@ config MIPS_ASID_BITS_VARIABLE
#
config HIGHMEM
bool "High Memory Support"
diff --git a/patches/mm-convert-swap-to-percpu-locked.patch b/patches/mm-convert-swap-to-percpu-locked.patch
index ef137ac5fc54..817ae137493c 100644
--- a/patches/mm-convert-swap-to-percpu-locked.patch
+++ b/patches/mm-convert-swap-to-percpu-locked.patch
@@ -44,7 +44,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
-@@ -6593,7 +6593,9 @@ static int page_alloc_cpu_notify(struct
+@@ -6594,7 +6594,9 @@ static int page_alloc_cpu_notify(struct
int cpu = (unsigned long)hcpu;
if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) {
diff --git a/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch b/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch
index f01f4c5ff914..7f9bca2c23a3 100644
--- a/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch
+++ b/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch
@@ -175,7 +175,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
return NULL;
}
-@@ -6557,6 +6581,7 @@ static int page_alloc_cpu_notify(struct
+@@ -6558,6 +6582,7 @@ static int page_alloc_cpu_notify(struct
void __init page_alloc_init(void)
{
hotcpu_notifier(page_alloc_cpu_notify, 0);
@@ -183,7 +183,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
}
/*
-@@ -7385,7 +7410,7 @@ void zone_pcp_reset(struct zone *zone)
+@@ -7386,7 +7411,7 @@ void zone_pcp_reset(struct zone *zone)
struct per_cpu_pageset *pset;
/* avoid races with drain_pages() */
@@ -192,7 +192,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
if (zone->pageset != &boot_pageset) {
for_each_online_cpu(cpu) {
pset = per_cpu_ptr(zone->pageset, cpu);
-@@ -7394,7 +7419,7 @@ void zone_pcp_reset(struct zone *zone)
+@@ -7395,7 +7420,7 @@ void zone_pcp_reset(struct zone *zone)
free_percpu(zone->pageset);
zone->pageset = &boot_pageset;
}
diff --git a/patches/rt-add-rt-locks.patch b/patches/rt-add-rt-locks.patch
index 34c8029f5463..90426cd427bf 100644
--- a/patches/rt-add-rt-locks.patch
+++ b/patches/rt-add-rt-locks.patch
@@ -2173,7 +2173,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
debug_rt_mutex_proxy_lock(lock, proxy_owner);
rt_mutex_set_owner(lock, proxy_owner);
}
-@@ -1904,3 +2293,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
+@@ -1916,3 +2305,25 @@ bool rt_mutex_cleanup_proxy_lock(struct
return cleanup;
}
diff --git a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch
index d9fe0dd73633..899d9e7b7b52 100644
--- a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch
+++ b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch
@@ -299,16 +299,16 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc>
rt_mutex_slowlock);
}
EXPORT_SYMBOL_GPL(rt_mutex_timed_lock);
-@@ -2241,7 +2377,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m
- set_current_state(TASK_INTERRUPTIBLE);
-
+@@ -2239,7 +2375,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m
+ raw_spin_lock_irq(&lock->wait_lock);
/* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
- ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter);
+ ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
-
- raw_spin_unlock_irq(&lock->wait_lock);
-
-@@ -2294,24 +2430,88 @@ bool rt_mutex_cleanup_proxy_lock(struct
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+@@ -2306,24 +2442,88 @@ bool rt_mutex_cleanup_proxy_lock(struct
return cleanup;
}
diff --git a/patches/series b/patches/series
index 7856dee8a9bf..2dbc3f45e48a 100644
--- a/patches/series
+++ b/patches/series
@@ -44,6 +44,7 @@ lockdep-Fix-per-cpu-static-objects.patch
0002-futex-Fix-small-and-harmless-looking-inconsistencies.patch
0003-futex-Clarify-mark_wake_futex-memory-barrier-usage.patch
0004-MAINTAINERS-Add-FUTEX-SUBSYSTEM.patch
+futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch
# Those two should vanish soon (not use PIT during bootup)
at91_dont_enable_disable_clock.patch
@@ -340,6 +341,7 @@ irq-allow-disabling-of-softirq-processing-in-irq-thread-context.patch
softirq-split-timer-softirqs-out-of-ksoftirqd.patch
softirq-wake-the-timer-softirq-if-needed.patch
timers-Don-t-wake-ktimersoftd-on-every-tick.patch
+Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch
rtmutex-trylock-is-okay-on-RT.patch
# compile fix due to rtmutex locks
diff --git a/patches/x86-kvm-require-const-tsc-for-rt.patch b/patches/x86-kvm-require-const-tsc-for-rt.patch
index f5ff76170216..1c55c606fcaa 100644
--- a/patches/x86-kvm-require-const-tsc-for-rt.patch
+++ b/patches/x86-kvm-require-const-tsc-for-rt.patch
@@ -14,7 +14,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
-@@ -5933,6 +5933,13 @@ int kvm_arch_init(void *opaque)
+@@ -5958,6 +5958,13 @@ int kvm_arch_init(void *opaque)
goto out;
}