Diffstat (limited to 'patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch')
-rw-r--r-- | patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch | 196 |
1 file changed, 196 insertions, 0 deletions
diff --git a/patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch b/patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch
new file mode 100644
index 000000000000..9877dc34acb5
--- /dev/null
+++ b/patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch
@@ -0,0 +1,196 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Sun, 15 Aug 2021 23:27:44 +0200
+Subject: [PATCH 06/72] sched/wakeup: Prepare for RT sleeping spin/rwlocks
+
+Waiting for spinlocks and rwlocks on non-RT enabled kernels is task::state
+preserving. Any wakeup which matches the state is valid.
+
+RT enabled kernels substitute them with 'sleeping' spinlocks. This creates
+an issue vs. task::__state.
+
+In order to block on the lock, the task has to overwrite task::__state and a
+subsequent wakeup issued by the unlocker sets the state back to
+TASK_RUNNING. As a consequence the task loses the state which was set
+before the lock acquire and also any regular wakeup targeted at the task
+while it is blocked on the lock.
+
+To handle this gracefully, add a 'saved_state' member to task_struct which
+is used in the following way:
+
+ 1) When a task blocks on a 'sleeping' spinlock, the current state is saved
+    in task::saved_state before it is set to TASK_RTLOCK_WAIT.
+
+ 2) When the task unblocks and after acquiring the lock, it restores the
+    saved state.
+
+ 3) When a regular wakeup happens for a task while it is blocked, the
+    state change of that wakeup is redirected to operate on
+    task::saved_state.
+
+    This is also required when the task state is running because the task
+    might have been woken up from the lock wait and has not yet restored
+    the saved state.
+
+To make it complete, provide the necessary helpers to save and restore the
+saved state along with the necessary documentation of how the RT lock
+blocking is supposed to work.
+
+For non-RT kernels there is no functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Link: https://lore.kernel.org/r/20210815211302.258751046@linutronix.de
+---
+ include/linux/sched.h |   66 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ kernel/sched/core.c   |   33 +++++++++++++++++++++++++
+ 2 files changed, 99 insertions(+)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -143,9 +143,22 @@ struct task_group;
+ 		current->task_state_change = _THIS_IP_;			\
+ 	} while (0)
+
++# define debug_rtlock_wait_set_state()					\
++	do {								\
++		current->saved_state_change = current->task_state_change;\
++		current->task_state_change = _THIS_IP_;			\
++	} while (0)
++
++# define debug_rtlock_wait_restore_state()				\
++	do {								\
++		current->task_state_change = current->saved_state_change;\
++	} while (0)
++
+ #else
+ # define debug_normal_state_change(cond)	do { } while (0)
+ # define debug_special_state_change(cond)	do { } while (0)
++# define debug_rtlock_wait_set_state()		do { } while (0)
++# define debug_rtlock_wait_restore_state()	do { } while (0)
+ #endif
+
+ /*
+@@ -213,6 +226,51 @@ struct task_group;
+ 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
+ 	} while (0)
+
++/*
++ * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
++ *
++ * RT's spin/rwlock substitutions are state preserving. The state of the
++ * task when blocking on the lock is saved in task_struct::saved_state and
++ * restored after the lock has been acquired. These operations are
++ * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
++ * lock related wakeups while the task is blocked on the lock are
++ * redirected to operate on task_struct::saved_state to ensure that these
++ * are not dropped. On restore task_struct::saved_state is set to
++ * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
++ *
++ * The lock operation looks like this:
++ *
++ *	current_save_and_set_rtlock_wait_state();
++ *	for (;;) {
++ *		if (try_lock())
++ *			break;
++ *		raw_spin_unlock_irq(&lock->wait_lock);
++ *		schedule_rtlock();
++ *		raw_spin_lock_irq(&lock->wait_lock);
++ *		set_current_state(TASK_RTLOCK_WAIT);
++ *	}
++ *	current_restore_rtlock_saved_state();
++ */
++#define current_save_and_set_rtlock_wait_state()			\
++	do {								\
++		lockdep_assert_irqs_disabled();				\
++		raw_spin_lock(&current->pi_lock);			\
++		current->saved_state = current->__state;		\
++		debug_rtlock_wait_set_state();				\
++		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
++		raw_spin_unlock(&current->pi_lock);			\
++	} while (0);
++
++#define current_restore_rtlock_saved_state()				\
++	do {								\
++		lockdep_assert_irqs_disabled();				\
++		raw_spin_lock(&current->pi_lock);			\
++		debug_rtlock_wait_restore_state();			\
++		WRITE_ONCE(current->__state, current->saved_state);	\
++		current->saved_state = TASK_RUNNING;			\
++		raw_spin_unlock(&current->pi_lock);			\
++	} while (0);
++
+ #define get_current_state()	READ_ONCE(current->__state)
+
+ /* Task command name length: */
+@@ -668,6 +726,11 @@ struct task_struct {
+ #endif
+ 	unsigned int			__state;
+
++#ifdef CONFIG_PREEMPT_RT
++	/* saved state for "spinlock sleepers" */
++	unsigned int			saved_state;
++#endif
++
+ 	/*
+ 	 * This begins the randomizable portion of task_struct. Only
+ 	 * scheduling-critical items should be added above here.
+@@ -1361,6 +1424,9 @@ struct task_struct {
+ 	struct kmap_ctrl		kmap_ctrl;
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ 	unsigned long			task_state_change;
++# ifdef CONFIG_PREEMPT_RT
++	unsigned long			saved_state_change;
++# endif
+ #endif
+ 	int				pagefault_disabled;
+ #ifdef CONFIG_MMU
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -3566,14 +3566,47 @@ static void ttwu_queue(struct task_struc
+  *
+  * The caller holds p::pi_lock if p != current or has preemption
+  * disabled when p == current.
++ *
++ * The rules of PREEMPT_RT saved_state:
++ *
++ *   The related locking code always holds p::pi_lock when updating
++ *   p::saved_state, which means the code is fully serialized in both cases.
++ *
++ *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
++ *   bits set. This allows to distinguish all wakeup scenarios.
+  */
+ static __always_inline
+ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+ {
++	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
++		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
++			     state != TASK_RTLOCK_WAIT);
++	}
++
+ 	if (READ_ONCE(p->__state) & state) {
+ 		*success = 1;
+ 		return true;
+ 	}
++
++#ifdef CONFIG_PREEMPT_RT
++	/*
++	 * Saved state preserves the task state across blocking on
++	 * an RT lock. If the state matches, set p::saved_state to
++	 * TASK_RUNNING, but do not wake the task because it waits
++	 * for a lock wakeup. Also indicate success because from
++	 * the regular waker's point of view this has succeeded.
++	 *
++	 * After acquiring the lock the task will restore p::__state
++	 * from p::saved_state which ensures that the regular
++	 * wakeup is not lost. The restore will also set
++	 * p::saved_state to TASK_RUNNING so any further tests will
++	 * not result in false positives vs. @success
++	 */
++	if (p->saved_state & state) {
++		p->saved_state = TASK_RUNNING;
++		*success = 1;
++	}
++#endif
+ 	return false;
+ }
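The comment block added to include/linux/sched.h gives the intended lock-side usage in pseudocode. As a rough sketch of how the two helpers pair up in a slowpath, the following kernel-style fragment spells it out; struct rtlock, try_lock_inner() and the wait_lock layout are hypothetical stand-ins for the rtmutex-based code that later patches in this series add, not part of this patch:

/*
 * Illustrative sketch only -- not from the patch. Assumes a hypothetical
 * 'struct rtlock' with a raw wait_lock and a try_lock_inner() helper.
 */
static void rtlock_lock(struct rtlock *lock)
{
	raw_spin_lock_irq(&lock->wait_lock);

	/* Save task::__state and switch to TASK_RTLOCK_WAIT. */
	current_save_and_set_rtlock_wait_state();

	for (;;) {
		if (try_lock_inner(lock))
			break;
		/* Drop the wait lock so the unlocker can make progress. */
		raw_spin_unlock_irq(&lock->wait_lock);
		schedule_rtlock();
		raw_spin_lock_irq(&lock->wait_lock);
		/* A lock wakeup left us TASK_RUNNING; sleep again. */
		set_current_state(TASK_RTLOCK_WAIT);
	}

	/* Reinstate the saved state, including any redirected wakeup. */
	current_restore_rtlock_saved_state();
	raw_spin_unlock_irq(&lock->wait_lock);
}

The point the patch's comment makes is visible here: set_current_state(TASK_RTLOCK_WAIT) must be re-asserted after every schedule_rtlock() round trip, because a lock wakeup sets __state to TASK_RUNNING while saved_state still holds the pre-lock state.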
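The ttwu_state_match() hunk is the waker side of the scheme. To see why a regular wakeup arriving during the lock wait is preserved rather than dropped, here is a self-contained user-space model in plain C; it is an illustration, not kernel code, and the state constants merely mirror the values in include/linux/sched.h without anything depending on them:

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING		0x0000
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_RTLOCK_WAIT	0x1000

struct task {
	unsigned int state;       /* models task_struct::__state */
	unsigned int saved_state; /* models task_struct::saved_state */
};

/* Models the logic of ttwu_state_match(); pi_lock serialization omitted. */
static bool state_match(struct task *p, unsigned int state, int *success)
{
	if (p->state & state) {
		*success = 1;
		return true;	/* proceed with a full wakeup */
	}
	if (p->saved_state & state) {
		/*
		 * Redirect: the task stays blocked on the lock, but the
		 * wakeup is recorded so the restore path reinstates
		 * TASK_RUNNING instead of the original sleeping state.
		 */
		p->saved_state = TASK_RUNNING;
		*success = 1;
	}
	return false;
}

int main(void)
{
	struct task t = { .state = TASK_INTERRUPTIBLE, .saved_state = TASK_RUNNING };
	int success = 0;

	/* 1) Task blocks on an RT lock: save state, set TASK_RTLOCK_WAIT. */
	t.saved_state = t.state;
	t.state = TASK_RTLOCK_WAIT;

	/* 2) A regular wakeup (e.g. a signal) targets TASK_INTERRUPTIBLE. */
	state_match(&t, TASK_INTERRUPTIBLE, &success);
	printf("wakeup: success=%d saved_state=%#x\n", success, t.saved_state);

	/* 3) Lock acquired: restore; the redirected wakeup takes effect. */
	t.state = t.saved_state;
	t.saved_state = TASK_RUNNING;
	printf("restore: state=%#x (TASK_RUNNING, wakeup not lost)\n", t.state);
	return 0;
}

Running this shows success=1 at step 2 even though __state did not match, and a final state of TASK_RUNNING at step 3: the wakeup was consumed by saved_state during the lock wait and takes effect at restore time instead of being lost.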