Diffstat (limited to 'patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch')
-rw-r--r-- | patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch | 196 |
1 file changed, 196 insertions, 0 deletions
diff --git a/patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch b/patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch
new file mode 100644
index 000000000000..9877dc34acb5
--- /dev/null
+++ b/patches/0006-sched-wakeup-Prepare-for-RT-sleeping-spin-rwlocks.patch
@@ -0,0 +1,196 @@
+From: Thomas Gleixner <tglx@linutronix.de>
+Date: Sun, 15 Aug 2021 23:27:44 +0200
+Subject: [PATCH 06/72] sched/wakeup: Prepare for RT sleeping spin/rwlocks
+
+Waiting for spinlocks and rwlocks on non-RT enabled kernels is task::state
+preserving. Any wakeup which matches the state is valid.
+
+RT enabled kernels substitute them with 'sleeping' spinlocks. This creates
+an issue vs. task::__state.
+
+In order to block on the lock, the task has to overwrite task::__state and a
+subsequent wakeup issued by the unlocker sets the state back to
+TASK_RUNNING. As a consequence the task loses the state which was set
+before the lock acquire and also any regular wakeup targeted at the task
+while it is blocked on the lock.
+
+To handle this gracefully, add a 'saved_state' member to task_struct which
+is used in the following way:
+
+ 1) When a task blocks on a 'sleeping' spinlock, the current state is saved
+    in task::saved_state before it is set to TASK_RTLOCK_WAIT.
+
+ 2) When the task unblocks and after acquiring the lock, it restores the
+    saved state.
+
+ 3) When a regular wakeup happens for a task while it is blocked, the
+    state change of that wakeup is redirected to operate on
+    task::saved_state.
+
+    This is also required when the task state is running because the task
+    might have been woken up from the lock wait and has not yet restored
+    the saved state.
+
+To make it complete, provide the necessary helpers to save and restore the
+saved state along with the necessary documentation of how the RT lock
+blocking is supposed to work.
+
+For non-RT kernels there is no functional change.
+
+Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Ingo Molnar <mingo@kernel.org>
+Link: https://lore.kernel.org/r/20210815211302.258751046@linutronix.de
+---
+ include/linux/sched.h |   66 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ kernel/sched/core.c   |   33 +++++++++++++++++++++++++
+ 2 files changed, 99 insertions(+)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -143,9 +143,22 @@ struct task_group;
+ 		current->task_state_change = _THIS_IP_;			\
+ 	} while (0)
+
++# define debug_rtlock_wait_set_state()					\
++	do {								\
++		current->saved_state_change = current->task_state_change;\
++		current->task_state_change = _THIS_IP_;			\
++	} while (0)
++
++# define debug_rtlock_wait_restore_state()				\
++	do {								\
++		current->task_state_change = current->saved_state_change;\
++	} while (0)
++
+ #else
+ # define debug_normal_state_change(cond)	do { } while (0)
+ # define debug_special_state_change(cond)	do { } while (0)
++# define debug_rtlock_wait_set_state()		do { } while (0)
++# define debug_rtlock_wait_restore_state()	do { } while (0)
+ #endif
+
+ /*
+@@ -213,6 +226,51 @@ struct task_group;
+ 		raw_spin_unlock_irqrestore(&current->pi_lock, flags);	\
+ 	} while (0)
+
++/*
++ * PREEMPT_RT specific variants for "sleeping" spin/rwlocks
++ *
++ * RT's spin/rwlock substitutions are state preserving. The state of the
++ * task when blocking on the lock is saved in task_struct::saved_state and
++ * restored after the lock has been acquired. These operations are
++ * serialized by task_struct::pi_lock against try_to_wake_up(). Any non RT
++ * lock related wakeups while the task is blocked on the lock are
++ * redirected to operate on task_struct::saved_state to ensure that these
++ * are not dropped. On restore task_struct::saved_state is set to
++ * TASK_RUNNING so any wakeup attempt redirected to saved_state will fail.
++ *
++ * The lock operation looks like this:
++ *
++ *	current_save_and_set_rtlock_wait_state();
++ *	for (;;) {
++ *		if (try_lock())
++ *			break;
++ *		raw_spin_unlock_irq(&lock->wait_lock);
++ *		schedule_rtlock();
++ *		raw_spin_lock_irq(&lock->wait_lock);
++ *		set_current_state(TASK_RTLOCK_WAIT);
++ *	}
++ *	current_restore_rtlock_saved_state();
++ */
++#define current_save_and_set_rtlock_wait_state()			\
++	do {								\
++		lockdep_assert_irqs_disabled();				\
++		raw_spin_lock(&current->pi_lock);			\
++		current->saved_state = current->__state;		\
++		debug_rtlock_wait_set_state();				\
++		WRITE_ONCE(current->__state, TASK_RTLOCK_WAIT);		\
++		raw_spin_unlock(&current->pi_lock);			\
++	} while (0);
++
++#define current_restore_rtlock_saved_state()				\
++	do {								\
++		lockdep_assert_irqs_disabled();				\
++		raw_spin_lock(&current->pi_lock);			\
++		debug_rtlock_wait_restore_state();			\
++		WRITE_ONCE(current->__state, current->saved_state);	\
++		current->saved_state = TASK_RUNNING;			\
++		raw_spin_unlock(&current->pi_lock);			\
++	} while (0);
++
+ #define get_current_state()	READ_ONCE(current->__state)
+
+ /* Task command name length: */
+@@ -668,6 +726,11 @@ struct task_struct {
+ #endif
+ 	unsigned int			__state;
+
++#ifdef CONFIG_PREEMPT_RT
++	/* saved state for "spinlock sleepers" */
++	unsigned int			saved_state;
++#endif
++
+ 	/*
+ 	 * This begins the randomizable portion of task_struct. Only
+ 	 * scheduling-critical items should be added above here.
+@@ -1361,6 +1424,9 @@ struct task_struct {
+ 	struct kmap_ctrl		kmap_ctrl;
+ #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
+ 	unsigned long			task_state_change;
++# ifdef CONFIG_PREEMPT_RT
++	unsigned long			saved_state_change;
++# endif
+ #endif
+ 	int				pagefault_disabled;
+ #ifdef CONFIG_MMU
+--- a/kernel/sched/core.c
++++ b/kernel/sched/core.c
+@@ -3566,14 +3566,47 @@ static void ttwu_queue(struct task_struc
+  *
+  * The caller holds p::pi_lock if p != current or has preemption
+  * disabled when p == current.
++ *
++ * The rules of PREEMPT_RT saved_state:
++ *
++ *   The related locking code always holds p::pi_lock when updating
++ *   p::saved_state, which means the code is fully serialized in both cases.
++ *
++ *   The lock wait and lock wakeups happen via TASK_RTLOCK_WAIT. No other
++ *   bits set. This allows to distinguish all wakeup scenarios.
+  */
+ static __always_inline
+ bool ttwu_state_match(struct task_struct *p, unsigned int state, int *success)
+ {
++	if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)) {
++		WARN_ON_ONCE((state & TASK_RTLOCK_WAIT) &&
++			     state != TASK_RTLOCK_WAIT);
++	}
++
+ 	if (READ_ONCE(p->__state) & state) {
+ 		*success = 1;
+ 		return true;
+ 	}
++
++#ifdef CONFIG_PREEMPT_RT
++	/*
++	 * Saved state preserves the task state across blocking on
++	 * an RT lock. If the state matches, set p::saved_state to
++	 * TASK_RUNNING, but do not wake the task because it waits
++	 * for a lock wakeup. Also indicate success because from
++	 * the regular waker's point of view this has succeeded.
++	 *
++	 * After acquiring the lock the task will restore p::__state
++	 * from p::saved_state which ensures that the regular
++	 * wakeup is not lost. The restore will also set
++	 * p::saved_state to TASK_RUNNING so any further tests will
++	 * not result in false positives vs. @success
++	 */
++	if (p->saved_state & state) {
++		p->saved_state = TASK_RUNNING;
++		*success = 1;
++	}
++#endif
+ 	return false;
+ }
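The comment block added to include/linux/sched.h gives the intended lock-side usage in pseudocode. As a rough sketch of how the two helpers pair up in a slowpath, the following kernel-style fragment spells it out; struct rtlock, try_lock_inner() and the wait_lock layout are hypothetical stand-ins for the rtmutex-based code that later patches in this series add, not part of this patch:

/*
 * Illustrative sketch only -- not from the patch. Assumes a hypothetical
 * 'struct rtlock' with a raw wait_lock and a try_lock_inner() helper.
 */
static void rtlock_lock(struct rtlock *lock)
{
	raw_spin_lock_irq(&lock->wait_lock);

	/* Save task::__state and switch to TASK_RTLOCK_WAIT. */
	current_save_and_set_rtlock_wait_state();

	for (;;) {
		if (try_lock_inner(lock))
			break;
		/* Drop the wait lock so the unlocker can make progress. */
		raw_spin_unlock_irq(&lock->wait_lock);
		schedule_rtlock();
		raw_spin_lock_irq(&lock->wait_lock);
		/* A lock wakeup left us TASK_RUNNING; sleep again. */
		set_current_state(TASK_RTLOCK_WAIT);
	}

	/* Reinstate the saved state, including any redirected wakeup. */
	current_restore_rtlock_saved_state();
	raw_spin_unlock_irq(&lock->wait_lock);
}

The point the patch's comment makes is visible here: set_current_state(TASK_RTLOCK_WAIT) must be re-asserted after every schedule_rtlock() round trip, because a lock wakeup sets __state to TASK_RUNNING while saved_state still holds the pre-lock state.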
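The ttwu_state_match() hunk is the waker side of the scheme. To see why a regular wakeup arriving during the lock wait is preserved rather than dropped, here is a self-contained user-space model in plain C; it is an illustration, not kernel code, and the state constants merely mirror the values in include/linux/sched.h without anything depending on them:

#include <stdbool.h>
#include <stdio.h>

#define TASK_RUNNING		0x0000
#define TASK_INTERRUPTIBLE	0x0001
#define TASK_RTLOCK_WAIT	0x1000

struct task {
	unsigned int state;       /* models task_struct::__state */
	unsigned int saved_state; /* models task_struct::saved_state */
};

/* Models the logic of ttwu_state_match(); pi_lock serialization omitted. */
static bool state_match(struct task *p, unsigned int state, int *success)
{
	if (p->state & state) {
		*success = 1;
		return true;	/* proceed with a full wakeup */
	}
	if (p->saved_state & state) {
		/*
		 * Redirect: the task stays blocked on the lock, but the
		 * wakeup is recorded so the restore path reinstates
		 * TASK_RUNNING instead of the original sleeping state.
		 */
		p->saved_state = TASK_RUNNING;
		*success = 1;
	}
	return false;
}

int main(void)
{
	struct task t = { .state = TASK_INTERRUPTIBLE, .saved_state = TASK_RUNNING };
	int success = 0;

	/* 1) Task blocks on an RT lock: save state, set TASK_RTLOCK_WAIT. */
	t.saved_state = t.state;
	t.state = TASK_RTLOCK_WAIT;

	/* 2) A regular wakeup (e.g. a signal) targets TASK_INTERRUPTIBLE. */
	state_match(&t, TASK_INTERRUPTIBLE, &success);
	printf("wakeup: success=%d saved_state=%#x\n", success, t.saved_state);

	/* 3) Lock acquired: restore; the redirected wakeup takes effect. */
	t.state = t.saved_state;
	t.saved_state = TASK_RUNNING;
	printf("restore: state=%#x (TASK_RUNNING, wakeup not lost)\n", t.state);
	return 0;
}

Running this shows success=1 at step 2 even though __state did not match, and a final state of TASK_RUNNING at step 3: the wakeup was consumed by saved_state during the lock wait and takes effect at restore time instead of being lost.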