From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 27 Apr 2023 13:19:35 +0200
Subject: [PATCH 2/4] locking/rtmutex: Submit/resume work explicitly
 before/after blocking

schedule() invokes sched_submit_work() before scheduling and
sched_resume_work() afterwards to ensure that queued block requests are
flushed and the workqueue/IO-worker machinery can instantiate new
workers if required. This avoids deadlocks and starvation.
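
For context, a simplified sketch of what these two hooks do (not the
exact kernel code, which lives in kernel/sched/core.c and handles a few
more corner cases):

  static void sched_submit_work(void)
  {
          struct task_struct *tsk = current;

          if (task_is_running(tsk))
                  return;

          /* Let the worker machinery create a replacement worker */
          if (tsk->flags & PF_WQ_WORKER)
                  wq_worker_sleeping(tsk);
          else if (tsk->flags & PF_IO_WORKER)
                  io_wq_worker_sleeping(tsk);

          /* Flush plugged block requests to avoid IO deadlocks */
          blk_flush_plug(tsk->plug, true);
  }

  static void sched_resume_work(void)
  {
          struct task_struct *tsk = current;

          if (tsk->flags & PF_WQ_WORKER)
                  wq_worker_running(tsk);
          else if (tsk->flags & PF_IO_WORKER)
                  io_wq_worker_running(tsk);
  }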

With rt_mutexes this can lead to a subtle problem:

  When a task blocks on a rtmutex, current::pi_blocked_on points to the
  rtmutex it blocks on. If one of the functions invoked from
  sched_submit_work() or sched_resume_work() then contends on a rtmutex
  based lock, it overwrites and thereby corrupts current::pi_blocked_on,
  as illustrated below.
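
An illustrative call chain for the corruption (the inner lock can be any
rtmutex based sleeping lock reached from the submit path):

  rt_mutex_slowlock()
    task_blocks_on_rt_mutex()        // current::pi_blocked_on = outer waiter
    schedule()
      sched_submit_work()
        blk_flush_plug()
          // some rtmutex based sleeping lock is contended here
          task_blocks_on_rt_mutex()  // overwrites current::pi_blocked_on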

Let rtmutex and the RT lock variants which are based on it invoke
sched_submit/resume_work() explicitly before and after the slowpath so
it's guaranteed that current::pi_blocked_on cannot be corrupted by blocking
on two locks.
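
Annotated sketch of the resulting ordering (see the rt_mutex_slowlock()
hunk below):

  rt_mutex_slowlock()
    sched_submit_work();             // may block; pi_blocked_on not yet set
    raw_spin_lock_irqsave(&lock->wait_lock, flags);
    __rt_mutex_slowlock_locked();    // sets pi_blocked_on, sleeps via
                                     // schedule_rtmutex() (no work hooks)
    raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
    sched_resume_work();             // may block; pi_blocked_on is NULL again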

This does not apply to the PREEMPT_RT variants of spinlock_t and rwlock_t
as their scheduling slowpath is separate and cannot invoke the work related
functions due to potential deadlocks anyway.
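
For completeness: the rtlock slowpath sleeps via schedule_rtlock(),
which calls __schedule() directly and thus never ran the work hooks to
begin with. Roughly (approximation of mainline kernel/sched/core.c):

  void __sched notrace schedule_rtlock(void)
  {
          do {
                  preempt_disable();
                  __schedule(SM_RTLOCK_WAIT);
                  sched_preempt_enable_no_resched();
          } while (need_resched());
  }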

[ tglx: Make it explicit and symmetric. Massage changelog ]

Fixes: e17ba59b7e8e1 ("locking/rtmutex: Guard regular sleeping locks specific functions")
Reported-by: Crystal Wood <swood@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/4b4ab374d3e24e6ea8df5cadc4297619a6d945af.camel@redhat.com
Link: https://lore.kernel.org/r/20230427111937.2745231-3-bigeasy@linutronix.de
---
 kernel/locking/rtmutex.c     |   11 +++++++++--
 kernel/locking/rwbase_rt.c   |   18 ++++++++++++++++--
 kernel/locking/rwsem.c       |    6 ++++++
 kernel/locking/spinlock_rt.c |    3 +++
 4 files changed, 34 insertions(+), 4 deletions(-)

Index: linux-6.3.0-rt11/kernel/locking/rtmutex.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rtmutex.c
+++ linux-6.3.0-rt11/kernel/locking/rtmutex.c
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1558 @ static int __sched rt_mutex_slowlock_blo
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			schedule_rtmutex();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1587 @ static void __sched rt_mutex_handle_dead
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		schedule_rtmutex();
 	}
 }
 
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1683 @ static int __sched rt_mutex_slowlock(str
 	int ret;
 
 	/*
+	 * The task is about to sleep. Invoke sched_submit_work() before
+	 * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+	 */
+	sched_submit_work();
+
+	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
 	 * (debug, no architecture support). In this case we will acquire the
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1700 @ static int __sched rt_mutex_slowlock(str
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
+	sched_resume_work();
 	return ret;
 }
 
Index: linux-6.3.0-rt11/kernel/locking/rwbase_rt.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rwbase_rt.c
+++ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c
@ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c:134 @ static int __sched __rwbase_read_lock(st
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
+	int ret;
+
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
-	return __rwbase_read_lock(rwb, state);
+	/*
+	 * The task is about to sleep. For rwsems this submits work as that
+	 * might take locks and corrupt tsk::pi_blocked_on. Must be
+	 * explicit here because __rwbase_read_lock() cannot invoke
+	 * rt_mutex_slowlock(). NOP for rwlocks.
+	 */
+	rwbase_sched_submit_work();
+	ret = __rwbase_read_lock(rwb, state);
+	rwbase_sched_resume_work();
+	return ret;
 }
 
 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c:244 @ static int __sched rwbase_write_lock(str
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	unsigned long flags;
 
-	/* Take the rtmutex as a first step */
+	/*
+	 * Take the rtmutex as a first step. For rwsem this will also
+	 * invoke sched_submit_work() to flush IO and workers.
+	 */
 	if (rwbase_rtmutex_lock_state(rtm, state))
 		return -EINTR;
 
Index: linux-6.3.0-rt11/kernel/locking/rwsem.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rwsem.c
+++ linux-6.3.0-rt11/kernel/locking/rwsem.c
@ linux-6.3.0-rt11/kernel/locking/rwsem.c:1418 @ static inline void __downgrade_write(str
 #define rwbase_rtmutex_lock_state(rtm, state)		\
 	__rt_mutex_lock(rtm, state)
 
+#define rwbase_sched_submit_work()			\
+	sched_submit_work()
+
+#define rwbase_sched_resume_work()			\
+	sched_resume_work()
+
 #define rwbase_rtmutex_slowlock_locked(rtm, state)	\
 	__rt_mutex_slowlock_locked(rtm, NULL, state)
 
Index: linux-6.3.0-rt11/kernel/locking/spinlock_rt.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/spinlock_rt.c
+++ linux-6.3.0-rt11/kernel/locking/spinlock_rt.c
@ linux-6.3.0-rt11/kernel/locking/spinlock_rt.c:162 @ rwbase_rtmutex_lock_state(struct rt_mute
 	return 0;
 }
 
+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {