author     Sebastian Andrzej Siewior <bigeasy@linutronix.de>   2017-05-27 10:04:35 +0200
committer  Sebastian Andrzej Siewior <bigeasy@linutronix.de>   2017-05-27 10:04:35 +0200
commit     92747649c94d4d6b1cf3ee8b24caafe8c8c019e3 (patch)
tree       5cf411b8809ad338ff660835611041b21f92a509
parent     67c6ffc95dc19132a0b3e4b672b66f604bc7c5bf (diff)
download   linux-rt-4.9.30-rt20-patches.tar.gz
[ANNOUNCE] v4.9.30-rt20
Dear RT folks!
I'm pleased to announce the v4.9.30-rt20 patch set.
Changes since v4.9.30-rt19:
- The patch "timers: Don't wake ktimersoftd on every tick" has been
  reverted because this optimisation can lead to timers not getting
  expired. Reported by Klaus Gusenleitner, debugged by Anna-Maria
  Gleixner. A sketch of the restored expiry pattern is shown below
  this list.
- Markus Trippelsdorf reported that the new futex code makes the
  glibc/nptl/tst-robustpi8 test fail. Patch by Peter Zijlstra. A toy
  model of the fixed cleanup logic is also shown below this list.
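
As an illustration of the timer revert, here is a minimal userspace
sketch of the collect-then-expire pattern the revert restores. This is
not kernel code: the wheel depth, the list handling and the names
(LVL_DEPTH, collect_expired(), expire()) are simplified stand-ins. The
point is that the expired lists live on the stack of the expiring
context and are detached and run under the same lock, so a concurrent
tick has no way to observe or overwrite the temporary heads.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

#define LVL_DEPTH 4	/* stand-in for the real wheel depth */

struct timer { struct timer *next; int id; };

static pthread_mutex_t base_lock = PTHREAD_MUTEX_INITIALIZER;
static struct timer *vectors[LVL_DEPTH];	/* pending timers per level */

/* Detach every pending level onto caller-provided heads, return count. */
static int collect_expired(struct timer **heads)
{
	int i, levels = 0;

	for (i = 0; i < LVL_DEPTH; i++) {
		if (vectors[i]) {
			heads[levels++] = vectors[i];
			vectors[i] = NULL;
		}
	}
	return levels;
}

static void expire(struct timer *t)
{
	while (t) {
		struct timer *next = t->next;

		printf("timer %d fired\n", t->id);
		free(t);
		t = next;
	}
}

static void run_timers(void)
{
	struct timer *heads[LVL_DEPTH];	/* on stack: invisible to other contexts */
	int levels;

	pthread_mutex_lock(&base_lock);
	levels = collect_expired(heads);	/* detach under the lock ... */
	while (levels--)
		expire(heads[levels]);		/* ... and run in the same section */
	pthread_mutex_unlock(&base_lock);
}

int main(void)
{
	int i;

	for (i = 0; i < 3; i++) {
		struct timer *t = malloc(sizeof(*t));

		t->id = i;
		t->next = vectors[i % LVL_DEPTH];
		vectors[i % LVL_DEPTH] = t;
	}
	run_timers();
	return 0;
}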
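
As an illustration of the futex fix, here is a toy model of the cleanup
logic; the types and names are simplified assumptions, not the kernel's
rtmutex API. The key point from Peter's patch: after a timed-out wait
the unlocking task may already have assigned the lock to us by setting
a NULL owner, so the cleanup path must do an unconditional try-lock
before it decides to remove itself from the wait list.

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Toy model only -- illustrative names, not the kernel's rtmutex API. */
struct toy_mutex {
	pthread_mutex_t wait_lock;
	void *owner;		/* NULL: free, or mid-handover by the unlocker */
	int nwaiters;
};

/* Unconditional try-lock: succeeds iff there is currently no owner. */
static bool try_to_take(struct toy_mutex *m, void *task)
{
	if (m->owner)
		return false;
	m->owner = task;
	m->nwaiters--;		/* taking the lock dequeues the waiter */
	return true;
}

/* Returns true if the caller must treat the wait as failed. */
static bool cleanup_proxy_lock(struct toy_mutex *m, void *task)
{
	bool cleanup = false;

	pthread_mutex_lock(&m->wait_lock);
	/*
	 * The unlock path may have marked the lock available for us
	 * (NULL owner) right before our timeout fired. Try to take it
	 * first; only if that fails are we really still a waiter.
	 */
	try_to_take(m, task);
	if (m->owner != task) {
		m->nwaiters--;	/* still enqueued: remove ourselves */
		cleanup = true;
	}
	pthread_mutex_unlock(&m->wait_lock);
	return cleanup;
}

int main(void)
{
	struct toy_mutex m = { PTHREAD_MUTEX_INITIALIZER, NULL, 1 };
	int me;

	/* Simulate the race: the unlocker already set a NULL owner for us. */
	printf("cleanup=%d, we own it: %s\n",
	       cleanup_proxy_lock(&m, &me), m.owner == &me ? "yes" : "no");
	return 0;
}

In the sketch the try-lock succeeds, cleanup stays false and the caller
knows it acquired the lock instead of timing out, which matches the
state user space already assumed.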
Known issues
- CPU hotplug got a little better but can deadlock.
- gdb. While gdb is following a task it is possible that after a
  fork() operation the task is waiting for gdb and gdb is waiting
  for the task.
The delta patch against v4.9.30-rt19 is appended below and can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/incr/patch-4.9.30-rt19-rt20.patch.xz
You can get this release via the git tree at:
git://git.kernel.org/pub/scm/linux/kernel/git/rt/linux-rt-devel.git v4.9.30-rt20
The RT patch against v4.9.30 can be found here:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patch-4.9.30-rt20.patch.xz
The split quilt queue is available at:
https://cdn.kernel.org/pub/linux/kernel/projects/rt/4.9/older/patches-4.9.30-rt20.tar.xz
Sebastian
diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c
--- a/kernel/locking/rtmutex.c
+++ b/kernel/locking/rtmutex.c
@@ -2400,11 +2400,14 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
int ret;
raw_spin_lock_irq(&lock->wait_lock);
-
- set_current_state(TASK_INTERRUPTIBLE);
-
/* sleep on the mutex */
+ set_current_state(TASK_INTERRUPTIBLE);
ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL);
+ /*
+ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+ * have to fix that up.
+ */
+ fixup_rt_mutex_waiters(lock);
/*
* RT has a problem here when the wait got interrupted by a timeout
@@ -2423,7 +2426,6 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock,
tsk->pi_blocked_on = NULL;
raw_spin_unlock(&tsk->pi_lock);
}
-
raw_spin_unlock_irq(&lock->wait_lock);
return ret;
@@ -2455,15 +2457,25 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock,
raw_spin_lock_irq(&lock->wait_lock);
/*
+ * Do an unconditional try-lock, this deals with the lock stealing
+ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter()
+ * sets a NULL owner.
+ *
+ * We're not interested in the return value, because the subsequent
+ * test on rt_mutex_owner() will infer that. If the trylock succeeded,
+ * we will own the lock and it will have removed the waiter. If we
+ * failed the trylock, we're still not owner and we need to remove
+ * ourselves.
+ */
+ try_to_take_rt_mutex(lock, current, waiter);
+ /*
* Unless we're the owner; we're still enqueued on the wait_list.
* So check if we became owner, if not, take us off the wait_list.
*/
if (rt_mutex_owner(lock) != current) {
remove_waiter(lock, waiter);
- fixup_rt_mutex_waiters(lock);
cleanup = true;
}
-
/*
* try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
* have to fix that up.
diff --git a/kernel/time/timer.c b/kernel/time/timer.c
--- a/kernel/time/timer.c
+++ b/kernel/time/timer.c
@@ -206,8 +206,6 @@ struct timer_base {
bool is_idle;
DECLARE_BITMAP(pending_map, WHEEL_SIZE);
struct hlist_head vectors[WHEEL_SIZE];
- struct hlist_head expired_lists[LVL_DEPTH];
- int expired_count;
} ____cacheline_aligned;
static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]);
@@ -1355,8 +1353,7 @@ static void call_timer_fn(struct timer_list *timer, void (*fn)(unsigned long),
}
}
-static inline void __expire_timers(struct timer_base *base,
- struct hlist_head *head)
+static void expire_timers(struct timer_base *base, struct hlist_head *head)
{
while (!hlist_empty(head)) {
struct timer_list *timer;
@@ -1387,38 +1384,21 @@ static inline void __expire_timers(struct timer_base *base,
}
}
-static void expire_timers(struct timer_base *base)
-{
- struct hlist_head *head;
-
- while (base->expired_count--) {
- head = base->expired_lists + base->expired_count;
- __expire_timers(base, head);
- }
- base->expired_count = 0;
-}
-
-static void __collect_expired_timers(struct timer_base *base)
+static int __collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
unsigned long clk = base->clk;
struct hlist_head *vec;
- int i;
+ int i, levels = 0;
unsigned int idx;
- /*
- * expire_timers() must be called at least once before we can
- * collect more timers
- */
- if (WARN_ON(base->expired_count))
- return;
-
for (i = 0; i < LVL_DEPTH; i++) {
idx = (clk & LVL_MASK) + i * LVL_SIZE;
if (__test_and_clear_bit(idx, base->pending_map)) {
vec = base->vectors + idx;
- hlist_move_list(vec,
- &base->expired_lists[base->expired_count++]);
+ hlist_move_list(vec, heads++);
+ levels++;
}
/* Is it time to look at the next level? */
if (clk & LVL_CLK_MASK)
@@ -1426,6 +1406,7 @@ static void __collect_expired_timers(struct timer_base *base)
/* Shift clock for the next level granularity */
clk >>= LVL_CLK_SHIFT;
}
+ return levels;
}
#ifdef CONFIG_NO_HZ_COMMON
@@ -1618,7 +1599,8 @@ void timer_clear_idle(void)
base->is_idle = false;
}
-static void collect_expired_timers(struct timer_base *base)
+static int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
/*
* NOHZ optimization. After a long idle sleep we need to forward the
@@ -1635,49 +1617,20 @@ static void collect_expired_timers(struct timer_base *base)
if (time_after(next, jiffies)) {
/* The call site will increment clock! */
base->clk = jiffies - 1;
- return;
+ return 0;
}
base->clk = next;
}
- __collect_expired_timers(base);
+ return __collect_expired_timers(base, heads);
}
#else
-static inline void collect_expired_timers(struct timer_base *base)
+static inline int collect_expired_timers(struct timer_base *base,
+ struct hlist_head *heads)
{
- __collect_expired_timers(base);
+ return __collect_expired_timers(base, heads);
}
#endif
-static int find_expired_timers(struct timer_base *base)
-{
- const unsigned long int end_clk = jiffies;
-
- while (!base->expired_count && time_after_eq(end_clk, base->clk)) {
- collect_expired_timers(base);
- base->clk++;
- }
-
- return base->expired_count;
-}
-
-/* Called from CPU tick routine to quickly collect expired timers */
-static int tick_find_expired(struct timer_base *base)
-{
- int count;
-
- raw_spin_lock(&base->lock);
-
- if (unlikely(time_after(jiffies, base->clk + HZ))) {
- /* defer to ktimersoftd; don't spend too long in irq context */
- count = -1;
- } else
- count = find_expired_timers(base);
-
- raw_spin_unlock(&base->lock);
-
- return count;
-}
-
/*
* Called from the timer interrupt handler to charge one tick to the current
* process. user_tick is 1 if the tick is user time, 0 for system.
@@ -1704,11 +1657,22 @@ void update_process_times(int user_tick)
*/
static inline void __run_timers(struct timer_base *base)
{
+ struct hlist_head heads[LVL_DEPTH];
+ int levels;
+
+ if (!time_after_eq(jiffies, base->clk))
+ return;
+
raw_spin_lock_irq(&base->lock);
- while (find_expired_timers(base))
- expire_timers(base);
+ while (time_after_eq(jiffies, base->clk)) {
+ levels = collect_expired_timers(base, heads);
+ base->clk++;
+
+ while (levels--)
+ expire_timers(base, heads + levels);
+ }
raw_spin_unlock_irq(&base->lock);
wakeup_timer_waiters(base);
}
@@ -1736,12 +1700,12 @@ void run_local_timers(void)
hrtimer_run_queues();
/* Raise the softirq only if required. */
- if (time_before(jiffies, base->clk) || !tick_find_expired(base)) {
+ if (time_before(jiffies, base->clk)) {
if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active)
return;
/* CPU is awake, so check the deferrable base. */
base++;
- if (time_before(jiffies, base->clk) || !tick_find_expired(base))
+ if (time_before(jiffies, base->clk))
return;
}
raise_softirq(TIMER_SOFTIRQ);
@@ -1911,7 +1875,6 @@ int timers_dead_cpu(unsigned int cpu)
raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING);
BUG_ON(old_base->running_timer);
- BUG_ON(old_base->expired_count);
for (i = 0; i < WHEEL_SIZE; i++)
migrate_timer_list(new_base, old_base->vectors + i);
@@ -1938,7 +1901,6 @@ static void __init init_timer_cpu(int cpu)
#ifdef CONFIG_PREEMPT_RT_FULL
init_swait_queue_head(&base->wait_for_running_timer);
#endif
- base->expired_count = 0;
}
}
diff --git a/localversion-rt b/localversion-rt
--- a/localversion-rt
+++ b/localversion-rt
@@ -1 +1 @@
--rt19
+-rt20
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
-rw-r--r--  patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch             |   4
-rw-r--r--  patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch    | 217
-rw-r--r--  patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch        | 125
-rw-r--r--  patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch           |  10
-rw-r--r--  patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch  |   4
-rw-r--r--  patches/irqwork-Move-irq-safe-work-to-irq-context.patch             |   4
-rw-r--r--  patches/irqwork-push_most_work_into_softirq_context.patch           |   4
-rw-r--r--  patches/localversion.patch                                          |   2
-rw-r--r--  patches/md-raid5-percpu-handling-rt-aware.patch                     |   4
-rw-r--r--  patches/mips-disable-highmem-on-rt.patch                            |   2
-rw-r--r--  patches/mm-convert-swap-to-percpu-locked.patch                      |   2
-rw-r--r--  patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch               |   6
-rw-r--r--  patches/rt-add-rt-locks.patch                                       |   2
-rw-r--r--  patches/rtmutex-add-a-first-shot-of-ww_mutex.patch                  |  14
-rw-r--r--  patches/series                                                      |   2
-rw-r--r--  patches/x86-kvm-require-const-tsc-for-rt.patch                      |   2
16 files changed, 374 insertions, 30 deletions
diff --git a/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch b/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch index 47c94bdc0da4..28b3dc4c6424 100644 --- a/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch +++ b/patches/NFSv4-replace-seqcount_t-with-a-seqlock_t.patch @@ -57,7 +57,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c -@@ -2695,7 +2695,7 @@ static int _nfs4_open_and_get_state(stru +@@ -2697,7 +2697,7 @@ static int _nfs4_open_and_get_state(stru unsigned int seq; int ret; @@ -66,7 +66,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> ret = _nfs4_proc_open(opendata); if (ret != 0) -@@ -2733,7 +2733,7 @@ static int _nfs4_open_and_get_state(stru +@@ -2735,7 +2735,7 @@ static int _nfs4_open_and_get_state(stru if (d_inode(dentry) == state->inode) { nfs_inode_attach_open_context(ctx); diff --git a/patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch b/patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch new file mode 100644 index 000000000000..3fa509c360c5 --- /dev/null +++ b/patches/Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch @@ -0,0 +1,217 @@ +From 16145f9c01a2e671aceb731050de9fbf977d31d0 Mon Sep 17 00:00:00 2001 +From: Anna-Maria Gleixner <anna-maria@linutronix.de> +Date: Fri, 26 May 2017 19:16:07 +0200 +Subject: [PATCH] Revert "timers: Don't wake ktimersoftd on every tick" + +This reverts commit 032f93cae150a ("timers: Don't wake ktimersoftd on +every tick"). + +The problem is that the look ahead optimization from the tick timer +interrupt context can race with the softirq thread expiring timer. As +a consequence the temporary hlist heads which hold the to expire +timers are overwritten and the timers which are already removed from +the wheel bucket for expiry are now dangling w/o a list head. + +That means those timers never get expired. If one of those timers is +canceled the removal operation will result in a hlist corruption. 
+ +Signed-off-by: Anna-Maria Gleixner <anna-maria@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/time/timer.c | 96 +++++++++++++++------------------------------------- + 1 file changed, 29 insertions(+), 67 deletions(-) + +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -206,8 +206,6 @@ struct timer_base { + bool is_idle; + DECLARE_BITMAP(pending_map, WHEEL_SIZE); + struct hlist_head vectors[WHEEL_SIZE]; +- struct hlist_head expired_lists[LVL_DEPTH]; +- int expired_count; + } ____cacheline_aligned; + + static DEFINE_PER_CPU(struct timer_base, timer_bases[NR_BASES]); +@@ -1355,8 +1353,7 @@ static void call_timer_fn(struct timer_l + } + } + +-static inline void __expire_timers(struct timer_base *base, +- struct hlist_head *head) ++static void expire_timers(struct timer_base *base, struct hlist_head *head) + { + while (!hlist_empty(head)) { + struct timer_list *timer; +@@ -1387,38 +1384,21 @@ static inline void __expire_timers(struc + } + } + +-static void expire_timers(struct timer_base *base) +-{ +- struct hlist_head *head; +- +- while (base->expired_count--) { +- head = base->expired_lists + base->expired_count; +- __expire_timers(base, head); +- } +- base->expired_count = 0; +-} +- +-static void __collect_expired_timers(struct timer_base *base) ++static int __collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) + { + unsigned long clk = base->clk; + struct hlist_head *vec; +- int i; ++ int i, levels = 0; + unsigned int idx; + +- /* +- * expire_timers() must be called at least once before we can +- * collect more timers +- */ +- if (WARN_ON(base->expired_count)) +- return; +- + for (i = 0; i < LVL_DEPTH; i++) { + idx = (clk & LVL_MASK) + i * LVL_SIZE; + + if (__test_and_clear_bit(idx, base->pending_map)) { + vec = base->vectors + idx; +- hlist_move_list(vec, +- &base->expired_lists[base->expired_count++]); ++ hlist_move_list(vec, heads++); ++ levels++; + } + /* Is it time to look at the next level? */ + if (clk & LVL_CLK_MASK) +@@ -1426,6 +1406,7 @@ static void __collect_expired_timers(str + /* Shift clock for the next level granularity */ + clk >>= LVL_CLK_SHIFT; + } ++ return levels; + } + + #ifdef CONFIG_NO_HZ_COMMON +@@ -1618,7 +1599,8 @@ void timer_clear_idle(void) + base->is_idle = false; + } + +-static void collect_expired_timers(struct timer_base *base) ++static int collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) + { + /* + * NOHZ optimization. After a long idle sleep we need to forward the +@@ -1635,49 +1617,20 @@ static void collect_expired_timers(struc + if (time_after(next, jiffies)) { + /* The call site will increment clock! 
*/ + base->clk = jiffies - 1; +- return; ++ return 0; + } + base->clk = next; + } +- __collect_expired_timers(base); ++ return __collect_expired_timers(base, heads); + } + #else +-static inline void collect_expired_timers(struct timer_base *base) ++static inline int collect_expired_timers(struct timer_base *base, ++ struct hlist_head *heads) + { +- __collect_expired_timers(base); ++ return __collect_expired_timers(base, heads); + } + #endif + +-static int find_expired_timers(struct timer_base *base) +-{ +- const unsigned long int end_clk = jiffies; +- +- while (!base->expired_count && time_after_eq(end_clk, base->clk)) { +- collect_expired_timers(base); +- base->clk++; +- } +- +- return base->expired_count; +-} +- +-/* Called from CPU tick routine to quickly collect expired timers */ +-static int tick_find_expired(struct timer_base *base) +-{ +- int count; +- +- raw_spin_lock(&base->lock); +- +- if (unlikely(time_after(jiffies, base->clk + HZ))) { +- /* defer to ktimersoftd; don't spend too long in irq context */ +- count = -1; +- } else +- count = find_expired_timers(base); +- +- raw_spin_unlock(&base->lock); +- +- return count; +-} +- + /* + * Called from the timer interrupt handler to charge one tick to the current + * process. user_tick is 1 if the tick is user time, 0 for system. +@@ -1704,11 +1657,22 @@ void update_process_times(int user_tick) + */ + static inline void __run_timers(struct timer_base *base) + { ++ struct hlist_head heads[LVL_DEPTH]; ++ int levels; ++ ++ if (!time_after_eq(jiffies, base->clk)) ++ return; ++ + raw_spin_lock_irq(&base->lock); + +- while (find_expired_timers(base)) +- expire_timers(base); ++ while (time_after_eq(jiffies, base->clk)) { ++ ++ levels = collect_expired_timers(base, heads); ++ base->clk++; + ++ while (levels--) ++ expire_timers(base, heads + levels); ++ } + raw_spin_unlock_irq(&base->lock); + wakeup_timer_waiters(base); + } +@@ -1734,12 +1698,12 @@ void run_local_timers(void) + + hrtimer_run_queues(); + /* Raise the softirq only if required. */ +- if (time_before(jiffies, base->clk) || !tick_find_expired(base)) { ++ if (time_before(jiffies, base->clk)) { + if (!IS_ENABLED(CONFIG_NO_HZ_COMMON) || !base->nohz_active) + return; + /* CPU is awake, so check the deferrable base. 
*/ + base++; +- if (time_before(jiffies, base->clk) || !tick_find_expired(base)) ++ if (time_before(jiffies, base->clk)) + return; + } + raise_softirq(TIMER_SOFTIRQ); +@@ -1909,7 +1873,6 @@ int timers_dead_cpu(unsigned int cpu) + raw_spin_lock_nested(&old_base->lock, SINGLE_DEPTH_NESTING); + + BUG_ON(old_base->running_timer); +- BUG_ON(old_base->expired_count); + + for (i = 0; i < WHEEL_SIZE; i++) + migrate_timer_list(new_base, old_base->vectors + i); +@@ -1936,7 +1899,6 @@ static void __init init_timer_cpu(int cp + #ifdef CONFIG_PREEMPT_RT_FULL + init_swait_queue_head(&base->wait_for_running_timer); + #endif +- base->expired_count = 0; + } + } + diff --git a/patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch b/patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch new file mode 100644 index 000000000000..21b716ce5196 --- /dev/null +++ b/patches/futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch @@ -0,0 +1,125 @@ +From: Peter Zijlstra <peterz@infradead.org> +Date: Mon, 22 May 2017 13:04:50 -0700 +Subject: [PATCH] futex,rt_mutex: Fix rt_mutex_cleanup_proxy_lock() + +Markus reported that the glibc/nptl/tst-robustpi8 test was failing after +commit: + + cfafcd117da0 ("futex: Rework futex_lock_pi() to use rt_mutex_*_proxy_lock()") + +The following trace shows the problem: + + ld-linux-x86-64-2161 [019] .... 410.760971: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_LOCK_PI + ld-linux-x86-64-2161 [019] ...1 410.760972: lock_pi_update_atomic: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000875 ret=0 + ld-linux-x86-64-2165 [011] .... 410.760978: SyS_futex: 00007ffbeb76b028: 80000875 op=FUTEX_UNLOCK_PI + ld-linux-x86-64-2165 [011] d..1 410.760979: do_futex: 00007ffbeb76b028: curval=80000875 uval=80000875 newval=80000871 ret=0 + ld-linux-x86-64-2165 [011] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=0000 + ld-linux-x86-64-2161 [019] .... 410.760980: SyS_futex: 00007ffbeb76b028: 80000871 ret=ETIMEDOUT + +Task 2165 does an UNLOCK_PI, assigning the lock to the waiter task 2161 +which then returns with -ETIMEDOUT. That wrecks the lock state, because now +the owner isn't aware it acquired the lock and removes the pending robust +list entry. + +If 2161 is killed, the robust list will not clear out this futex and the +subsequent acquire on this futex will then (correctly) result in -ESRCH +which is unexpected by glibc, triggers an internal assertion and dies. + +Task 2161 Task 2165 + +rt_mutex_wait_proxy_lock() + timeout(); + /* T2161 is still queued in the waiter list */ + return -ETIMEDOUT; + + futex_unlock_pi() + spin_lock(hb->lock); + rtmutex_unlock() + remove_rtmutex_waiter(T2161); + mark_lock_available(); + /* Make the next waiter owner of the user space side */ + futex_uval = 2161; + spin_unlock(hb->lock); +spin_lock(hb->lock); +rt_mutex_cleanup_proxy_lock() + if (rtmutex_owner() !== current) + ... + return FAIL; +.... +return -ETIMEOUT; + +This means that rt_mutex_cleanup_proxy_lock() needs to call +try_to_take_rt_mutex() so it can take over the rtmutex correctly which was +assigned by the waker. If the rtmutex is owned by some other task then this +call is harmless and just confirmes that the waiter is not able to acquire +it. + +While there, fix what looks like a merge error which resulted in +rt_mutex_cleanup_proxy_lock() having two calls to +fixup_rt_mutex_waiters() and rt_mutex_wait_proxy_lock() not having any. +Both should have one, since both potentially touch the waiter list. 
+ +Fixes: 38d589f2fd08 ("futex,rt_mutex: Restructure rt_mutex_finish_proxy_lock()") +Reported-by: Markus Trippelsdorf <markus@trippelsdorf.de> +Bug-Spotted-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org> +Cc: Florian Weimer <fweimer@redhat.com> +Cc: Darren Hart <dvhart@infradead.org> +Cc: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +Cc: Markus Trippelsdorf <markus@trippelsdorf.de> +Link: http://lkml.kernel.org/r/20170519154850.mlomgdsd26drq5j6@hirez.programming.kicks-ass.net +Signed-off-by: Thomas Gleixner <tglx@linutronix.de> +Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> +--- + kernel/locking/rtmutex.c | 24 ++++++++++++++++++------ + 1 file changed, 18 insertions(+), 6 deletions(-) + +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -1775,12 +1775,14 @@ int rt_mutex_wait_proxy_lock(struct rt_m + int ret; + + raw_spin_lock_irq(&lock->wait_lock); +- +- set_current_state(TASK_INTERRUPTIBLE); +- + /* sleep on the mutex */ ++ set_current_state(TASK_INTERRUPTIBLE); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); +- ++ /* ++ * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might ++ * have to fix that up. ++ */ ++ fixup_rt_mutex_waiters(lock); + raw_spin_unlock_irq(&lock->wait_lock); + + return ret; +@@ -1812,15 +1814,25 @@ bool rt_mutex_cleanup_proxy_lock(struct + + raw_spin_lock_irq(&lock->wait_lock); + /* ++ * Do an unconditional try-lock, this deals with the lock stealing ++ * state where __rt_mutex_futex_unlock() -> mark_wakeup_next_waiter() ++ * sets a NULL owner. ++ * ++ * We're not interested in the return value, because the subsequent ++ * test on rt_mutex_owner() will infer that. If the trylock succeeded, ++ * we will own the lock and it will have removed the waiter. If we ++ * failed the trylock, we're still not owner and we need to remove ++ * ourselves. ++ */ ++ try_to_take_rt_mutex(lock, current, waiter); ++ /* + * Unless we're the owner; we're still enqueued on the wait_list. + * So check if we became owner, if not, take us off the wait_list. + */ + if (rt_mutex_owner(lock) != current) { + remove_waiter(lock, waiter); +- fixup_rt_mutex_waiters(lock); + cleanup = true; + } +- + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. diff --git a/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch b/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch index 5752fed09b9e..66da85792465 100644 --- a/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch +++ b/patches/futex-rtmutex-Cure-RT-double-blocking-issue.patch @@ -34,10 +34,11 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> int ret; raw_spin_lock_irq(&lock->wait_lock); -@@ -2397,6 +2398,24 @@ int rt_mutex_wait_proxy_lock(struct rt_m - /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); - +@@ -2399,6 +2400,24 @@ int rt_mutex_wait_proxy_lock(struct rt_m + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); ++ + /* + * RT has a problem here when the wait got interrupted by a timeout + * or a signal. task->pi_blocked_on is still set. 
The task must @@ -55,7 +56,6 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> + tsk->pi_blocked_on = NULL; + raw_spin_unlock(&tsk->pi_lock); + } -+ raw_spin_unlock_irq(&lock->wait_lock); return ret; diff --git a/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch b/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch index 753d477ac205..84f2aac616bd 100644 --- a/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch +++ b/patches/iommu-vt-d-don-t-disable-preemption-while-accessing-.patch @@ -35,7 +35,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> /* bitmap for indexing intel_iommus */ static int g_num_of_iommus; -@@ -3716,10 +3716,8 @@ static void add_unmap(struct dmar_domain +@@ -3719,10 +3719,8 @@ static void add_unmap(struct dmar_domain struct intel_iommu *iommu; struct deferred_flush_entry *entry; struct deferred_flush_data *flush_data; @@ -47,7 +47,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> /* Flush all CPUs' entries to avoid deferring too much. If * this becomes a bottleneck, can just flush us, and rely on -@@ -3752,8 +3750,6 @@ static void add_unmap(struct dmar_domain +@@ -3755,8 +3753,6 @@ static void add_unmap(struct dmar_domain } flush_data->size++; spin_unlock_irqrestore(&flush_data->lock, flags); diff --git a/patches/irqwork-Move-irq-safe-work-to-irq-context.patch b/patches/irqwork-Move-irq-safe-work-to-irq-context.patch index c79e13418be4..33b7c138ec91 100644 --- a/patches/irqwork-Move-irq-safe-work-to-irq-context.patch +++ b/patches/irqwork-Move-irq-safe-work-to-irq-context.patch @@ -55,7 +55,7 @@ Cc: stable-rt@vger.kernel.org * Synchronize against the irq_work @entry, ensures the entry is not --- a/kernel/time/timer.c +++ b/kernel/time/timer.c -@@ -1691,7 +1691,7 @@ void update_process_times(int user_tick) +@@ -1644,7 +1644,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(user_tick); @@ -64,7 +64,7 @@ Cc: stable-rt@vger.kernel.org if (in_irq()) irq_work_tick(); #endif -@@ -1720,9 +1720,7 @@ static __latent_entropy void run_timer_s +@@ -1684,9 +1684,7 @@ static __latent_entropy void run_timer_s { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); diff --git a/patches/irqwork-push_most_work_into_softirq_context.patch b/patches/irqwork-push_most_work_into_softirq_context.patch index 424ee22771bc..7af377095b54 100644 --- a/patches/irqwork-push_most_work_into_softirq_context.patch +++ b/patches/irqwork-push_most_work_into_softirq_context.patch @@ -163,7 +163,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> /* --- a/kernel/time/timer.c +++ b/kernel/time/timer.c -@@ -1691,7 +1691,7 @@ void update_process_times(int user_tick) +@@ -1644,7 +1644,7 @@ void update_process_times(int user_tick) scheduler_tick(); run_local_timers(); rcu_check_callbacks(user_tick); @@ -172,7 +172,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> if (in_irq()) irq_work_tick(); #endif -@@ -1720,6 +1720,10 @@ static __latent_entropy void run_timer_s +@@ -1684,6 +1684,10 @@ static __latent_entropy void run_timer_s { struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); diff --git a/patches/localversion.patch b/patches/localversion.patch index 19d7ea05016c..d7c1a50b87ee 100644 --- a/patches/localversion.patch +++ b/patches/localversion.patch @@ -10,4 +10,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- /dev/null +++ b/localversion-rt @@ -0,0 +1 @@ -+-rt19 ++-rt20 diff --git 
a/patches/md-raid5-percpu-handling-rt-aware.patch b/patches/md-raid5-percpu-handling-rt-aware.patch index 2593aa1b7012..16e023890d14 100644 --- a/patches/md-raid5-percpu-handling-rt-aware.patch +++ b/patches/md-raid5-percpu-handling-rt-aware.patch @@ -41,7 +41,7 @@ Tested-by: Udo van den Heuvel <udovdh@xs4all.nl> } static struct stripe_head *alloc_stripe(struct kmem_cache *sc, gfp_t gfp, -@@ -6391,6 +6393,7 @@ static int raid456_cpu_up_prepare(unsign +@@ -6393,6 +6395,7 @@ static int raid456_cpu_up_prepare(unsign __func__, cpu); return -ENOMEM; } @@ -49,7 +49,7 @@ Tested-by: Udo van den Heuvel <udovdh@xs4all.nl> return 0; } -@@ -6401,7 +6404,6 @@ static int raid5_alloc_percpu(struct r5c +@@ -6403,7 +6406,6 @@ static int raid5_alloc_percpu(struct r5c conf->percpu = alloc_percpu(struct raid5_percpu); if (!conf->percpu) return -ENOMEM; diff --git a/patches/mips-disable-highmem-on-rt.patch b/patches/mips-disable-highmem-on-rt.patch index 3a4f8ac2ac56..0ff1fe5302bc 100644 --- a/patches/mips-disable-highmem-on-rt.patch +++ b/patches/mips-disable-highmem-on-rt.patch @@ -11,7 +11,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig -@@ -2515,7 +2515,7 @@ config MIPS_ASID_BITS_VARIABLE +@@ -2516,7 +2516,7 @@ config MIPS_ASID_BITS_VARIABLE # config HIGHMEM bool "High Memory Support" diff --git a/patches/mm-convert-swap-to-percpu-locked.patch b/patches/mm-convert-swap-to-percpu-locked.patch index ef137ac5fc54..817ae137493c 100644 --- a/patches/mm-convert-swap-to-percpu-locked.patch +++ b/patches/mm-convert-swap-to-percpu-locked.patch @@ -44,7 +44,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } --- a/mm/page_alloc.c +++ b/mm/page_alloc.c -@@ -6593,7 +6593,9 @@ static int page_alloc_cpu_notify(struct +@@ -6594,7 +6594,9 @@ static int page_alloc_cpu_notify(struct int cpu = (unsigned long)hcpu; if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) { diff --git a/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch b/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch index f01f4c5ff914..7f9bca2c23a3 100644 --- a/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch +++ b/patches/mm-page_alloc-rt-friendly-per-cpu-pages.patch @@ -175,7 +175,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> return NULL; } -@@ -6557,6 +6581,7 @@ static int page_alloc_cpu_notify(struct +@@ -6558,6 +6582,7 @@ static int page_alloc_cpu_notify(struct void __init page_alloc_init(void) { hotcpu_notifier(page_alloc_cpu_notify, 0); @@ -183,7 +183,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> } /* -@@ -7385,7 +7410,7 @@ void zone_pcp_reset(struct zone *zone) +@@ -7386,7 +7411,7 @@ void zone_pcp_reset(struct zone *zone) struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ @@ -192,7 +192,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); -@@ -7394,7 +7419,7 @@ void zone_pcp_reset(struct zone *zone) +@@ -7395,7 +7420,7 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } diff --git a/patches/rt-add-rt-locks.patch b/patches/rt-add-rt-locks.patch index 34c8029f5463..90426cd427bf 100644 --- a/patches/rt-add-rt-locks.patch +++ b/patches/rt-add-rt-locks.patch @@ -2173,7 +2173,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } -@@ -1904,3 +2293,25 @@ bool rt_mutex_cleanup_proxy_lock(struct +@@ 
-1916,3 +2305,25 @@ bool rt_mutex_cleanup_proxy_lock(struct return cleanup; } diff --git a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch index d9fe0dd73633..899d9e7b7b52 100644 --- a/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch +++ b/patches/rtmutex-add-a-first-shot-of-ww_mutex.patch @@ -299,16 +299,16 @@ Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> rt_mutex_slowlock); } EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); -@@ -2241,7 +2377,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m - set_current_state(TASK_INTERRUPTIBLE); - +@@ -2239,7 +2375,7 @@ int rt_mutex_wait_proxy_lock(struct rt_m + raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ + set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); - - raw_spin_unlock_irq(&lock->wait_lock); - -@@ -2294,24 +2430,88 @@ bool rt_mutex_cleanup_proxy_lock(struct + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. +@@ -2306,24 +2442,88 @@ bool rt_mutex_cleanup_proxy_lock(struct return cleanup; } diff --git a/patches/series b/patches/series index 7856dee8a9bf..2dbc3f45e48a 100644 --- a/patches/series +++ b/patches/series @@ -44,6 +44,7 @@ lockdep-Fix-per-cpu-static-objects.patch 0002-futex-Fix-small-and-harmless-looking-inconsistencies.patch 0003-futex-Clarify-mark_wake_futex-memory-barrier-usage.patch 0004-MAINTAINERS-Add-FUTEX-SUBSYSTEM.patch +futex-rt_mutex-Fix-rt_mutex_cleanup_proxy_lock.patch # Those two should vanish soon (not use PIT during bootup) at91_dont_enable_disable_clock.patch @@ -340,6 +341,7 @@ irq-allow-disabling-of-softirq-processing-in-irq-thread-context.patch softirq-split-timer-softirqs-out-of-ksoftirqd.patch softirq-wake-the-timer-softirq-if-needed.patch timers-Don-t-wake-ktimersoftd-on-every-tick.patch +Revert-timers-Don-t-wake-ktimersoftd-on-every-tick.patch rtmutex-trylock-is-okay-on-RT.patch # compile fix due to rtmutex locks diff --git a/patches/x86-kvm-require-const-tsc-for-rt.patch b/patches/x86-kvm-require-const-tsc-for-rt.patch index f5ff76170216..1c55c606fcaa 100644 --- a/patches/x86-kvm-require-const-tsc-for-rt.patch +++ b/patches/x86-kvm-require-const-tsc-for-rt.patch @@ -14,7 +14,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de> --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c -@@ -5933,6 +5933,13 @@ int kvm_arch_init(void *opaque) +@@ -5958,6 +5958,13 @@ int kvm_arch_init(void *opaque) goto out; } |