18 files changed, 403 insertions, 30 deletions
diff --git a/patches/0003-rtmutex-Add-a-special-case-for-ww-mutex-handling.patch b/patches/0003-rtmutex-Add-a-special-case-for-ww-mutex-handling.patch
index bd8827d7e3f6..43c1ffd5a2fd 100644
--- a/patches/0003-rtmutex-Add-a-special-case-for-ww-mutex-handling.patch
+++ b/patches/0003-rtmutex-Add-a-special-case-for-ww-mutex-handling.patch
@@ -17,7 +17,7 @@ Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
 --- a/kernel/locking/rtmutex.c
 +++ b/kernel/locking/rtmutex.c
-@@ -1059,8 +1059,26 @@ static int __sched task_blocks_on_rt_mut
+@@ -1097,8 +1097,26 @@ static int __sched task_blocks_on_rt_mut
  	 * which is wrong, as the other waiter is not in a deadlock
  	 * situation.
  	 */
diff --git a/patches/Add_localversion_for_-RT_release.patch b/patches/Add_localversion_for_-RT_release.patch
index 7b0058411028..a3ac9f9ca746 100644
--- a/patches/Add_localversion_for_-RT_release.patch
+++ b/patches/Add_localversion_for_-RT_release.patch
@@ -15,4 +15,4 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 --- /dev/null
 +++ b/localversion-rt
 @@ -0,0 +1 @@
-+-rt13
++-rt14
diff --git a/patches/block_mq__do_not_invoke_preempt_disable.patch b/patches/block_mq__do_not_invoke_preempt_disable.patch
index 504d0bc04b02..f2cf6fa8e78e 100644
--- a/patches/block_mq__do_not_invoke_preempt_disable.patch
+++ b/patches/block_mq__do_not_invoke_preempt_disable.patch
@@ -18,7 +18,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 ---
 --- a/block/blk-mq.c
 +++ b/block/blk-mq.c
-@@ -1572,14 +1572,14 @@ static void __blk_mq_delay_run_hw_queue(
+@@ -1552,14 +1552,14 @@ static void __blk_mq_delay_run_hw_queue(
  		return;
 
  	if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
diff --git a/patches/locking-rtmutex-Dequeue-waiter-on-ww_mutex-deadlock.patch b/patches/locking-rtmutex-Dequeue-waiter-on-ww_mutex-deadlock.patch
new file mode 100644
index 000000000000..7c0be09fdbba
--- /dev/null
+++ b/patches/locking-rtmutex-Dequeue-waiter-on-ww_mutex-deadlock.patch
@@ -0,0 +1,42 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 25 Aug 2021 12:33:14 +0200
+Subject: [PATCH] locking/rtmutex: Dequeue waiter on ww_mutex deadlock
+
+The rt_mutex based ww_mutex variant queues the new waiter first in the
+lock's rbtree before evaluating the ww_mutex specific conditions which
+might decide that the waiter should back out. This check and conditional
+exit happen before the waiter is enqueued into the PI chain.
+
+The failure handling at the call site assumes that the waiter, if it is
+the topmost waiter on the lock, is queued in the PI chain and then
+proceeds to adjust the unmodified PI chain, which results in RB tree
+corruption.
+
+Dequeue the waiter from the lock waiter list in the ww_mutex error exit
+path to prevent this.
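For clarity, the fixed error path then reads roughly as follows. This is a
condensed sketch of task_blocks_on_rt_mutex() with the hunk below applied,
not a verbatim excerpt; names follow the 5.14-rt sources and surrounding
code is elided:

    if (build_ww_mutex() && ww_ctx) {
            struct rt_mutex *rtm;

            /* Check whether the waiter should back out immediately. */
            rtm = container_of(lock, struct rt_mutex, rtmutex);
            res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
            if (res) {
                    /*
                     * The waiter already sits in the lock's rbtree, but
                     * not yet in the PI chain: undo the rbtree enqueue
                     * before returning, otherwise the caller unwinds a
                     * PI chain the waiter never joined and corrupts the
                     * RB tree.
                     */
                    raw_spin_lock(&task->pi_lock);
                    rt_mutex_dequeue(lock, waiter);
                    task->pi_blocked_on = NULL;
                    raw_spin_unlock(&task->pi_lock);
                    return res;
            }
    }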
+
+Fixes: add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex")
+Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20210825102454.042280541@linutronix.de
+---
+ kernel/locking/rtmutex.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1082,8 +1082,13 @@ static int __sched task_blocks_on_rt_mut
+ 		/* Check whether the waiter should back out immediately */
+ 		rtm = container_of(lock, struct rt_mutex, rtmutex);
+ 		res = __ww_mutex_add_waiter(waiter, rtm, ww_ctx);
+-		if (res)
++		if (res) {
++			raw_spin_lock(&task->pi_lock);
++			rt_mutex_dequeue(lock, waiter);
++			task->pi_blocked_on = NULL;
++			raw_spin_unlock(&task->pi_lock);
+ 			return res;
++		}
+ 	}
+
+ 	if (!owner)
diff --git a/patches/locking-rtmutex-Dont-dereference-waiter-lockless.patch b/patches/locking-rtmutex-Dont-dereference-waiter-lockless.patch
new file mode 100644
index 000000000000..b0b857c6e0e4
--- /dev/null
+++ b/patches/locking-rtmutex-Dont-dereference-waiter-lockless.patch
@@ -0,0 +1,79 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Wed, 25 Aug 2021 12:33:12 +0200
+Subject: [PATCH] locking/rtmutex: Dont dereference waiter lockless
+
+The new rt_mutex_spin_on_owner() loop checks whether the spinning waiter
+is still the top waiter on the lock by utilizing rt_mutex_top_waiter(),
+which is broken because that function contains a sanity check which
+dereferences the top waiter pointer to check whether the waiter belongs
+to the lock. That's wrong in the lockless spinwait case:
+
+ CPU 0                                          CPU 1
+ rt_mutex_lock(lock)                            rt_mutex_lock(lock);
+   queue(waiter0)
+   waiter0 == rt_mutex_top_waiter(lock)
+   rt_mutex_spin_on_owner(lock, waiter0) {        queue(waiter1)
+                                                  waiter1 == rt_mutex_top_waiter(lock)
+                                                  ...
+     top_waiter = rt_mutex_top_waiter(lock)
+       leftmost = rb_first_cached(&lock->waiters);
+                                                  -> signal
+                                                  dequeue(waiter1)
+                                                  destroy(waiter1)
+       w = rb_entry(leftmost, ....)
+       BUG_ON(w->lock != lock)   <- UAF
+
+The BUG_ON() is correct for the case where the caller holds
+lock->wait_lock, which guarantees that the leftmost waiter entry cannot
+vanish. For the lockless spinwait case it's broken.
+
+Create a new helper function which avoids the pointer dereference and
+just compares the leftmost entry pointer with current's waiter pointer
+to validate that current is still eligible for spinning.
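The essence of the fix is to compare pointers without dereferencing them.
A sketch of the helper and its use, matching the hunks that follow
(slightly condensed; the full loop also rechecks the lock owner):

    /*
     * Safe lockless check: only the address of the leftmost rb_node is
     * compared against the waiter this CPU already owns. The node itself
     * is never dereferenced, so it may be dequeued and freed concurrently
     * without harm. rt_mutex_top_waiter() instead dereferences it for a
     * sanity check (BUG_ON(w->lock != lock)), the use-after-free above.
     */
    static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
                                                     struct rt_mutex_waiter *waiter)
    {
            struct rb_node *leftmost = rb_first_cached(&lock->waiters);

            return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
    }

    /* Break condition in rtmutex_spin_on_owner() with the fix applied: */
    if (!owner->on_cpu || need_resched() ||
        !rt_mutex_waiter_is_top_waiter(lock, waiter) ||
        vcpu_is_preempted(task_cpu(owner))) {
            res = false;
            break;
    }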
+
+Fixes: 992caf7f1724 ("locking/rtmutex: Add adaptive spinwait mechanism")
+Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20210825102453.981720644@linutronix.de
+---
+ kernel/locking/rtmutex.c        |  5 +++--
+ kernel/locking/rtmutex_common.h | 13 +++++++++++++
+ 2 files changed, 16 insertions(+), 2 deletions(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -1329,8 +1329,9 @@ static bool rtmutex_spin_on_owner(struct
+ 	 *    for CONFIG_PREEMPT_RCU=y)
+ 	 *  - the VCPU on which owner runs is preempted
+ 	 */
+-	if (!owner->on_cpu || waiter != rt_mutex_top_waiter(lock) ||
+-	    need_resched() || vcpu_is_preempted(task_cpu(owner))) {
++	if (!owner->on_cpu || need_resched() ||
++	    !rt_mutex_waiter_is_top_waiter(lock, waiter) ||
++	    vcpu_is_preempted(task_cpu(owner))) {
+ 		res = false;
+ 		break;
+ 	}
+--- a/kernel/locking/rtmutex_common.h
++++ b/kernel/locking/rtmutex_common.h
+@@ -95,6 +95,19 @@ static inline int rt_mutex_has_waiters(s
+ 	return !RB_EMPTY_ROOT(&lock->waiters.rb_root);
+ }
+
++/*
++ * Lockless speculative check whether @waiter is still the top waiter on
++ * @lock. This is solely comparing pointers and not dereferencing the
++ * leftmost entry which might be about to vanish.
++ */
++static inline bool rt_mutex_waiter_is_top_waiter(struct rt_mutex_base *lock,
++						 struct rt_mutex_waiter *waiter)
++{
++	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
++
++	return rb_entry(leftmost, struct rt_mutex_waiter, tree_entry) == waiter;
++}
++
+ static inline struct rt_mutex_waiter *rt_mutex_top_waiter(struct rt_mutex_base *lock)
+ {
+ 	struct rb_node *leftmost = rb_first_cached(&lock->waiters);
diff --git a/patches/locking-rtmutex-Prevent-spurious-EDEADLK-return-caus.patch b/patches/locking-rtmutex-Prevent-spurious-EDEADLK-return-caus.patch
new file mode 100644
index 000000000000..8041c53b8cad
--- /dev/null
+++ b/patches/locking-rtmutex-Prevent-spurious-EDEADLK-return-caus.patch
@@ -0,0 +1,65 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Thu, 26 Aug 2021 09:36:53 +0200
+Subject: [PATCH] locking/rtmutex: Prevent spurious EDEADLK return caused by
+ ww_mutexes
+
+rtmutex based ww_mutexes can legitimately create a cycle in the lock graph
+which can be observed by a blocker which didn't cause the problem:
+
+   P1: A, ww_A, ww_B
+   P2: ww_B, ww_A
+   P3: A
+
+P3 might therefore be trapped in the ww_mutex induced cycle and run into
+the lock depth limitation of rt_mutex_adjust_prio_chain(), which returns
+-EDEADLK to the caller.
+
+Disable the deadlock detection walk when the chain walk observes a
+ww_mutex to prevent this looping.
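The guard itself is small. A condensed sketch of what the patch adds to
rt_mutex_adjust_prio_chain(); the full hunk with its explanatory comment
follows below:

    /*
     * Once the chain walk encounters a ww_mutex waiter, stop detecting
     * deadlocks: the cycle created by P1 and P2 resolves through the
     * wound/die protocol, and an innocent blocker such as P3 must not be
     * handed -EDEADLK for it. The walk then ends through its normal
     * termination conditions once all relevant tasks are boosted.
     */
    if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
            detect_deadlock = false;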
+
+[ tglx: Split it apart and added changelog ]
+
+Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
+Fixes: add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex")
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Link: https://lore.kernel.org/r/YSeWjCHoK4v5OcOt@hirez.programming.kicks-ass.net
+---
+ kernel/locking/rtmutex.c | 25 +++++++++++++++++++++++++
+ 1 file changed, 25 insertions(+)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -657,6 +657,31 @@ static int __sched rt_mutex_adjust_prio_
+ 		goto out_unlock_pi;
+
+ 	/*
++	 * There could be 'spurious' loops in the lock graph due to ww_mutex,
++	 * consider:
++	 *
++	 *   P1: A, ww_A, ww_B
++	 *   P2: ww_B, ww_A
++	 *   P3: A
++	 *
++	 * P3 should not return -EDEADLK because it gets trapped in the cycle
++	 * created by P1 and P2 (which will resolve -- and runs into
++	 * max_lock_depth above). Therefore disable detect_deadlock such that
++	 * the below termination condition can trigger once all relevant tasks
++	 * are boosted.
++	 *
++	 * Even when we start with ww_mutex we can disable deadlock detection,
++	 * since we would suppress a ww_mutex induced deadlock at [6] anyway.
++	 * Suppressing it here however is not sufficient since we might still
++	 * hit [6] due to adjustment driven iteration.
++	 *
++	 * NOTE: if someone were to create a deadlock between 2 ww_classes we'd
++	 * utterly fail to report it; lockdep should.
++	 */
++	if (IS_ENABLED(CONFIG_PREEMPT_RT) && waiter->ww_ctx && detect_deadlock)
++		detect_deadlock = false;
++
++	/*
+ 	 * Drop out, when the task has no waiters. Note,
+ 	 * top_waiter can be NULL, when we are in the deboosting
+ 	 * mode!
diff --git a/patches/locking-rtmutex-Return-success-on-deadlock-for-ww_mu.patch b/patches/locking-rtmutex-Return-success-on-deadlock-for-ww_mu.patch
new file mode 100644
index 000000000000..1f8e636f2673
--- /dev/null
+++ b/patches/locking-rtmutex-Return-success-on-deadlock-for-ww_mu.patch
@@ -0,0 +1,53 @@
+From: Peter Zijlstra <peterz@infradead.org>
+Date: Thu, 26 Aug 2021 10:48:18 +0200
+Subject: [PATCH] locking/rtmutex: Return success on deadlock for ww_mutex
+ waiters
+
+ww_mutexes can legitimately cause a deadlock situation in the lock graph
+which is resolved afterwards by the wait/wound mechanics. The rtmutex
+chain walk can detect such a deadlock and return -EDEADLK, which in turn
+skips the wait/wound mechanism and propagates -EDEADLK to the caller.
+That's wrong because both lock chains might get -EDEADLK, or the wrong
+waiter would back out.
+
+Detect that situation and return 'success' in case the waiter which
+initiated the chain walk is a ww_mutex waiter with context. This allows
+the wait/wound mechanics to resolve the situation according to the rules.
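Condensed, the changed exit in rt_mutex_adjust_prio_chain() looks like
this (a sketch of the hunk below, not a verbatim excerpt):

    if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
            ret = -EDEADLK;

            /*
             * A ww_mutex waiter reports success instead of -EDEADLK here,
             * so the wound/die rules, not the chain walk, pick which of
             * the contending threads backs out.
             */
            if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
                    ret = 0;

            raw_spin_unlock(&lock->wait_lock);
            goto out_unlock_pi;
    }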
+
+[ tglx: Split it apart and added changelog ]
+
+Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
+Fixes: add461325ec5 ("locking/rtmutex: Extend the rtmutex core to support ww_mutex")
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Link: https://lore.kernel.org/r/YSeWjCHoK4v5OcOt@hirez.programming.kicks-ass.net
+---
+ kernel/locking/rtmutex.c | 15 ++++++++++++++-
+ 1 file changed, 14 insertions(+), 1 deletion(-)
+
+--- a/kernel/locking/rtmutex.c
++++ b/kernel/locking/rtmutex.c
+@@ -717,8 +717,21 @@ static int __sched rt_mutex_adjust_prio_
+ 	 * walk, we detected a deadlock.
+ 	 */
+ 	if (lock == orig_lock || rt_mutex_owner(lock) == top_task) {
+-		raw_spin_unlock(&lock->wait_lock);
+ 		ret = -EDEADLK;
++
++		/*
++		 * When the deadlock is due to ww_mutex; also see above. Don't
++		 * report the deadlock and instead let the ww_mutex wound/die
++		 * logic pick which of the contending threads gets -EDEADLK.
++		 *
++		 * NOTE: assumes the cycle only contains a single ww_class; any
++		 * other configuration and we fail to report; also, see
++		 * lockdep.
++		 */
++		if (IS_ENABLED(CONFIG_PREEMPT_RT) && orig_waiter && orig_waiter->ww_ctx)
++			ret = 0;
++
++		raw_spin_unlock(&lock->wait_lock);
+ 		goto out_unlock_pi;
+ 	}
+
diff --git a/patches/mm_zsmalloc__copy_with_get_cpu_var_and_locking.patch b/patches/mm_zsmalloc__copy_with_get_cpu_var_and_locking.patch
index 5a8d7a611d4b..efe9b877e4a7 100644
--- a/patches/mm_zsmalloc__copy_with_get_cpu_var_and_locking.patch
+++ b/patches/mm_zsmalloc__copy_with_get_cpu_var_and_locking.patch
@@ -12,11 +12,9 @@ larger struct to allocate.
 Signed-off-by: Mike Galbraith <umgwanakikbuti@gmail.com>
 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 [bigeasy: replace the bitspin_lock() with a mutex, get_locked_var(). Mike then
-fixed the size magic]
+fixed the size magic, Mike made handle lock spinlock_t]
 Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
 Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
-
-
 ---
  mm/zsmalloc.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++-----
  1 file changed, 79 insertions(+), 6 deletions(-)
@@ -39,7 +37,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 +
 +struct zsmalloc_handle {
 +	unsigned long addr;
-+	struct mutex lock;
++	spinlock_t lock;
 +};
 +
 +#define ZS_HANDLE_ALLOC_SIZE (sizeof(struct zsmalloc_handle))
@@ -69,7 +67,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  				0, 0, NULL);
  	if (!pool->handle_cachep)
  		return 1;
-@@ -346,9 +362,26 @@ static void destroy_cache(struct zs_pool
+@@ -346,10 +362,27 @@ static void destroy_cache(struct zs_pool
 
  static unsigned long cache_alloc_handle(struct zs_pool *pool, gfp_t gfp)
  {
 +	void *p;
 +
 +	p = kmem_cache_alloc(pool->handle_cachep,
 +			     gfp & ~(__GFP_HIGHMEM|__GFP_MOVABLE));
 +	if (p) {
 +		struct zsmalloc_handle *zh = p;
 +
-+		mutex_init(&zh->lock);
++		spin_lock_init(&zh->lock);
 +	}
 +#endif
 +	return (unsigned long)p;
 +}
 +
 +#ifdef CONFIG_PREEMPT_RT
 +static struct zsmalloc_handle *zs_get_pure_handle(unsigned long handle)
 +{
 +	return (void *)(handle &~((1 << OBJ_TAG_BITS) - 1));
- }
++}
 +#endif
-
+
  static void cache_free_handle(struct zs_pool *pool, unsigned long handle)
  {
 +	kmem_cache_free(pool->handle_cachep, (void *)handle);
@@ -368,12 +401,18 @@ static void cache_free_zspage(struct zs_
 
  static void record_obj(unsigned long handle, unsigned long obj)
  {
++#ifdef CONFIG_PREEMPT_RT
++	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
++
++	WRITE_ONCE(zh->addr, obj);
++#else
  	/*
  	 * lsb of @obj represents handle lock while other bits
  	 * represent object value the handle is pointing so
  	 * updating shouldn't do store tearing.
  	 */
  	WRITE_ONCE(*(unsigned long *)handle, obj);
++#endif
  }

 +#ifdef CONFIG_PREEMPT_RT
 +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
 +
-+	return mutex_is_locked(&zh->lock);
++	return spin_is_locked(&zh->lock);
 +#else
  	return bit_spin_is_locked(HANDLE_PIN_BIT, (unsigned long *)handle);
 +#endif

 +#ifdef CONFIG_PREEMPT_RT
 +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
 +
-+	return mutex_trylock(&zh->lock);
++	return spin_trylock(&zh->lock);
 +#else
  	return bit_spin_trylock(HANDLE_PIN_BIT, (unsigned long *)handle);
 +#endif

 +#ifdef CONFIG_PREEMPT_RT
 +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
 +
-+	return mutex_lock(&zh->lock);
++	return spin_lock(&zh->lock);
 +#else
  	bit_spin_lock(HANDLE_PIN_BIT, (unsigned long *)handle);
 +#endif

 +#ifdef CONFIG_PREEMPT_RT
 +	struct zsmalloc_handle *zh = zs_get_pure_handle(handle);
 +
-+	return mutex_unlock(&zh->lock);
++	return spin_unlock(&zh->lock);
 +#else
  	bit_spin_unlock(HANDLE_PIN_BIT, (unsigned long *)handle);
 +#endif
diff --git a/patches/printk__remove_NMI_tracking.patch b/patches/printk__remove_NMI_tracking.patch
index 7ce2b133b823..726f8c8b59c8 100644
--- a/patches/printk__remove_NMI_tracking.patch
+++ b/patches/printk__remove_NMI_tracking.patch
@@ -234,7 +234,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  	len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args);
 --- a/kernel/trace/trace.c
 +++ b/kernel/trace/trace.c
-@@ -9803,7 +9803,7 @@ void ftrace_dump(enum ftrace_dump_mode o
+@@ -9815,7 +9815,7 @@ void ftrace_dump(enum ftrace_dump_mode o
  	tracing_off();
 
  	local_irq_save(flags);
 
  	/* Simulate the iterator */
  	trace_init_global_iter(&iter);
-@@ -9885,7 +9885,7 @@ void ftrace_dump(enum ftrace_dump_mode o
+@@ -9897,7 +9897,7 @@ void ftrace_dump(enum ftrace_dump_mode o
  	atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
  	}
  	atomic_dec(&dump_running);
diff --git a/patches/printk__remove_deferred_printing.patch b/patches/printk__remove_deferred_printing.patch
index 690d02095422..fd867298ef48 100644
--- a/patches/printk__remove_deferred_printing.patch
+++ b/patches/printk__remove_deferred_printing.patch
@@ -761,7 +761,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
 --- a/kernel/trace/trace.c
 +++ b/kernel/trace/trace.c
-@@ -9803,7 +9803,6 @@ void ftrace_dump(enum ftrace_dump_mode o
+@@ -9815,7 +9815,6 @@ void ftrace_dump(enum ftrace_dump_mode o
  	tracing_off();
 
  	local_irq_save(flags);
 
  	/* Simulate the iterator */
  	trace_init_global_iter(&iter);
-@@ -9885,7 +9884,6 @@ void ftrace_dump(enum ftrace_dump_mode o
+@@ -9897,7 +9896,6 @@ void ftrace_dump(enum ftrace_dump_mode o
  	atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled);
  	}
  	atomic_dec(&dump_running);
diff --git a/patches/rcu__Delay_RCU-selftests.patch b/patches/rcu__Delay_RCU-selftests.patch
index ba0cf2f4ef51..3c6abe8cbb18 100644
--- a/patches/rcu__Delay_RCU-selftests.patch
+++ b/patches/rcu__Delay_RCU-selftests.patch
@@ -34,7 +34,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  void rcu_sysrq_end(void);
 --- a/init/main.c
 +++ b/init/main.c
-@@ -1580,6 +1580,7 @@ static noinline void __init kernel_init_
+@@ -1585,6 +1585,7 @@ static noinline void __init kernel_init_
 
  	rcu_init_tasks_generic();
  	do_pre_smp_initcalls();
diff --git a/patches/sched-Fix-get_push_task-vs-migrate_disable.patch b/patches/sched-Fix-get_push_task-vs-migrate_disable.patch
new file mode 100644
index 000000000000..2a0edf8fc5fc
--- /dev/null
+++ b/patches/sched-Fix-get_push_task-vs-migrate_disable.patch
@@ -0,0 +1,38 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Thu, 26 Aug 2021 15:37:38 +0200
+Subject: [PATCH] sched: Fix get_push_task() vs migrate_disable()
+
+push_rt_task() attempts to move the currently running task away if the
+next runnable task has migration disabled and therefore is pinned on the
+current CPU.
+
+The current task is retrieved via get_push_task() which only checks for
+nr_cpus_allowed == 1, but does not check whether the task has migration
+disabled and therefore cannot be moved either. The consequence is a
+pointless invocation of the migration thread, which correctly observes
+that the task cannot be moved.
+
+Return NULL if the task has migration disabled and cannot be moved to
+another CPU.
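With the check added below, the complete helper reads as follows. This is
reconstructed from the hunk plus the surrounding 5.14 kernel/sched/sched.h
and shown only for context:

    static inline struct task_struct *get_push_task(struct rq *rq)
    {
            struct task_struct *p = rq->curr;

            lockdep_assert_rq_held(rq);

            if (rq->push_busy)
                    return NULL;

            /* Pinned by affinity: pushing it away is pointless. */
            if (p->nr_cpus_allowed == 1)
                    return NULL;

            /* New: pinned by an active migrate_disable() section. */
            if (p->migration_disabled)
                    return NULL;

            rq->push_busy = true;
            return get_task_struct(p);
    }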
+
+Cc: stable-rt@vger.kernel.org
+Fixes: a7c81556ec4d3 ("sched: Fix migrate_disable() vs rt/dl balancing")
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20210826133738.yiotqbtdaxzjsnfj@linutronix.de
+---
+ kernel/sched/sched.h | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/kernel/sched/sched.h
++++ b/kernel/sched/sched.h
+@@ -2255,6 +2255,9 @@ static inline struct task_struct *get_pu
+ 	if (p->nr_cpus_allowed == 1)
+ 		return NULL;
+
++	if (p->migration_disabled)
++		return NULL;
++
+ 	rq->push_busy = true;
+ 	return get_task_struct(p);
+ }
diff --git a/patches/sched-Prevent-balance_push-on-remote-runqueues.patch b/patches/sched-Prevent-balance_push-on-remote-runqueues.patch
new file mode 100644
index 000000000000..b576aa4eb67d
--- /dev/null
+++ b/patches/sched-Prevent-balance_push-on-remote-runqueues.patch
@@ -0,0 +1,50 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Fri, 27 Aug 2021 16:07:30 +0200
+Subject: [PATCH] sched: Prevent balance_push() on remote runqueues
+
+sched_setscheduler() and rt_mutex_setprio() invoke the run-queue balance
+callback after changing priorities or the scheduling class of a task. The
+run-queue for which the callback is invoked can be local or remote.
+
+That's not a problem for the regular rq::push_work, which is serialized
+with a busy flag in the run-queue struct, but it is wrong for the
+balance_push() work, which is only valid to be invoked on the outgoing
+CPU. It not only triggers the debug warning, but also leaves the per-CPU
+variable push_work unprotected, which can result in double enqueues on
+the stop machine list.
+
+Remove the warning and check that the function is invoked on the
+outgoing CPU. If not, just return and do nothing.
+
+Fixes: ae7927023243 ("sched: Optimize finish_lock_switch()")
+Reported-by: Sebastian Siewior <bigeasy@linutronix.de>
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Cc: stable@vger.kernel.org
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Link: https://lore.kernel.org/r/87tujb0yn1.ffs@tglx
+---
+ kernel/sched/core.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -8435,7 +8435,6 @@ static void balance_push(struct rq *rq)
+ 	struct task_struct *push_task = rq->curr;
+
+ 	lockdep_assert_rq_held(rq);
+-	SCHED_WARN_ON(rq->cpu != smp_processor_id());
+
+ 	/*
+ 	 * Ensure the thing is persistent until balance_push_set(.on = false);
+@@ -8443,9 +8442,10 @@ static void balance_push(struct rq *rq)
+ 	rq->balance_callback = &balance_push_callback;
+
+ 	/*
+-	 * Only active while going offline.
++	 * Only active while going offline and when invoked on the outgoing
++	 * CPU.
+ 	 */
+-	if (!cpu_dying(rq->cpu))
++	if (!cpu_dying(rq->cpu) && rq == this_rq())
+ 		return;
+
+ 	/*
diff --git a/patches/sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch b/patches/sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch
new file mode 100644
index 000000000000..cf894fb444c9
--- /dev/null
+++ b/patches/sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch
@@ -0,0 +1,39 @@
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Date: Tue, 24 Aug 2021 22:47:37 +0200
+Subject: [PATCH] sched: Switch wait_task_inactive to HRTIMER_MODE_REL_HARD
+
+With PREEMPT_RT enabled all hrtimer callbacks will be invoked in
+softirq mode unless they are explicitly marked as HRTIMER_MODE_HARD.
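(A minimal illustration of that distinction, not part of the patch; the
demo function name is made up:)

    /*
     * Illustrative sketch: with PREEMPT_RT, a timer armed in
     * HRTIMER_MODE_REL expires from softirq context and thus needs
     * ksoftirqd to run, which is circular while waiting for ksoftirqd's
     * own creation. HRTIMER_MODE_REL_HARD expires from hard interrupt
     * context and has no such dependency.
     */
    static void demo_wait_one_tick_hard(void)
    {
            ktime_t to = NSEC_PER_SEC / HZ;

            set_current_state(TASK_UNINTERRUPTIBLE);
            schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);  /* no ksoftirqd involved */
    }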
+During boot kthread_bind() is used for the creation of per-CPU threads
+and then hangs in wait_task_inactive() if ksoftirqd is not yet up and
+running.
+The hang disappeared since commit
+   26c7295be0c5e ("kthread: Do not preempt current task if it is going to call schedule()")
+but enabling the function tracer on boot reliably leads to the freeze on
+boot behaviour again.
+The timer in wait_task_inactive() can not be directly abused via a user
+interface to trigger a mass wakeup of several tasks at the same time,
+which would lead to long sections with disabled interrupts.
+Therefore it is safe to make the timer HRTIMER_MODE_REL_HARD.
+
+Switch the timer to HRTIMER_MODE_REL_HARD.
+
+Cc: stable-rt@vger.kernel.org
+Link: https://lkml.kernel.org/r/20210826170408.vm7rlj7odslshwch@linutronix.de
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+---
+ kernel/sched/core.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -3033,7 +3033,7 @@ unsigned long wait_task_inactive(struct
+ 		ktime_t to = NSEC_PER_SEC / HZ;
+
+ 		set_current_state(TASK_UNINTERRUPTIBLE);
+-		schedule_hrtimeout(&to, HRTIMER_MODE_REL);
++		schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD);
+ 		continue;
+ 	}
+
diff --git a/patches/sched__Add_support_for_lazy_preemption.patch b/patches/sched__Add_support_for_lazy_preemption.patch
index 28e357d431eb..8caa49943970 100644
--- a/patches/sched__Add_support_for_lazy_preemption.patch
+++ b/patches/sched__Add_support_for_lazy_preemption.patch
@@ -519,7 +519,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  /*
 --- a/kernel/sched/sched.h
 +++ b/kernel/sched/sched.h
-@@ -2303,6 +2303,15 @@ extern void reweight_task(struct task_st
+@@ -2306,6 +2306,15 @@ extern void reweight_task(struct task_st
 
  extern void resched_curr(struct rq *rq);
  extern void resched_cpu(int cpu);
@@ -556,7 +556,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  }
 
  struct ring_buffer_event *
-@@ -4182,15 +4190,17 @@ unsigned long trace_total_entries(struct
+@@ -4194,15 +4202,17 @@ unsigned long trace_total_entries(struct
 
  static void print_lat_help_header(struct seq_file *m)
  {
@@ -583,7 +583,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  }
 
  static void print_event_info(struct array_buffer *buf, struct seq_file *m)
-@@ -4224,14 +4234,16 @@ static void print_func_help_header_irq(s
+@@ -4236,14 +4246,16 @@ static void print_func_help_header_irq(s
 
  	print_event_info(buf, m);
diff --git a/patches/series b/patches/series
index e18684a1ab12..fb2046ef8376 100644
--- a/patches/series
+++ b/patches/series
@@ -79,6 +79,9 @@ printk__Enhance_the_condition_check_of_msleep_in_pr_flush.patch
 # Posted
 ###########################################################################
 highmem-Don-t-disable-preemption-on-RT-in-kmap_atomi.patch
+sched-Fix-get_push_task-vs-migrate_disable.patch
+sched-Switch-wait_task_inactive-to-HRTIMER_MODE_REL_.patch
+sched-Prevent-balance_push-on-remote-runqueues.patch
 
 ###########################################################################
 # Post
@@ -202,7 +205,12 @@ debugobjects__Make_RT_aware.patch
 0070-locking-rtmutex-Add-adaptive-spinwait-mechanism.patch
 0071-locking-spinlock-rt-Prepare-for-RT-local_lock.patch
 0072-locking-local_lock-Add-PREEMPT_RT-support.patch
+
 locking-ww_mutex-Initialize-waiter.ww_ctx-properly.patch
+locking-rtmutex-Dont-dereference-waiter-lockless.patch
+locking-rtmutex-Dequeue-waiter-on-ww_mutex-deadlock.patch
+locking-rtmutex-Return-success-on-deadlock-for-ww_mu.patch
+locking-rtmutex-Prevent-spurious-EDEADLK-return-caus.patch
 
 ###########################################################################
 # Locking: RT bits. Need review
 ###########################################################################
diff --git a/patches/shmem__Use_raw_spinlock_t_for_-stat_lock.patch b/patches/shmem__Use_raw_spinlock_t_for_-stat_lock.patch
index 679909e60347..5e3cd9deb12b 100644
--- a/patches/shmem__Use_raw_spinlock_t_for_-stat_lock.patch
+++ b/patches/shmem__Use_raw_spinlock_t_for_-stat_lock.patch
@@ -97,7 +97,7 @@ Link: https://lore.kernel.org/r/20210806142916.jdwkb5bx62q5fwfo@linutronix.de
  	}
  	return mpol;
  }
-@@ -3500,9 +3501,10 @@ static int shmem_reconfigure(struct fs_c
+@@ -3488,9 +3489,10 @@ static int shmem_reconfigure(struct fs_c
  	struct shmem_options *ctx = fc->fs_private;
  	struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb);
  	unsigned long inodes;
 
  	inodes = sbinfo->max_inodes - sbinfo->free_inodes;
  	if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) {
  		if (!sbinfo->max_blocks) {
-@@ -3547,14 +3549,15 @@ static int shmem_reconfigure(struct fs_c
+@@ -3535,14 +3537,15 @@ static int shmem_reconfigure(struct fs_c
  	 * Preserve previous mempolicy unless mpol remount option was specified.
  	 */
  	if (ctx->mpol) {
 
  	return invalfc(fc, "%s", err);
  }
-@@ -3671,7 +3674,7 @@ static int shmem_fill_super(struct super
+@@ -3659,7 +3662,7 @@ static int shmem_fill_super(struct super
  	sbinfo->mpol = ctx->mpol;
  	ctx->mpol = NULL;
diff --git a/patches/trace__Add_migrate-disabled_counter_to_tracing_output.patch b/patches/trace__Add_migrate-disabled_counter_to_tracing_output.patch
index 828ff1ced76d..f1b1f779e71c 100644
--- a/patches/trace__Add_migrate-disabled_counter_to_tracing_output.patch
+++ b/patches/trace__Add_migrate-disabled_counter_to_tracing_output.patch
@@ -60,7 +60,7 @@ Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
  }
 
  struct ring_buffer_event *
-@@ -4177,9 +4187,10 @@ static void print_lat_help_header(struct
+@@ -4189,9 +4199,10 @@ static void print_lat_help_header(struct
  	    "#                  | / _----=> need-resched            \n"
  	    "#                  || / _---=> hardirq/softirq         \n"
  	    "#                  ||| / _--=> preempt-depth           \n"
  }
 
  static void print_event_info(struct array_buffer *buf, struct seq_file *m)
-@@ -4217,9 +4228,10 @@ static void print_func_help_header_irq(s
+@@ -4229,9 +4240,10 @@ static void print_func_help_header_irq(s
  	seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space);
  	seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space);
  	seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space);