From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Thu, 27 Apr 2023 13:19:35 +0200
Subject: [PATCH 2/4] locking/rtmutex: Submit/resume work explicitly before/after blocking

schedule() invokes sched_submit_work() before scheduling and
sched_resume_work() afterwards to ensure that queued block requests are
flushed and the (IO)worker machineries can instantiate new workers if
required. This avoids deadlocks and starvation.

With rt_mutexes this can lead to a subtle problem: when a task blocks on
a rtmutex, current::pi_blocked_on points to the rtmutex it blocks on.
When one of the functions invoked from sched_submit/resume_work()
contends on a rtmutex based lock, that would corrupt
current::pi_blocked_on.

Let rtmutex and the RT lock variants which are based on it invoke
sched_submit/resume_work() explicitly before and after the slowpath, so
it is guaranteed that current::pi_blocked_on cannot be corrupted by
blocking on two locks.

This does not apply to the PREEMPT_RT variants of spinlock_t and
rwlock_t as their scheduling slowpath is separate and cannot invoke the
work related functions due to potential deadlocks anyway.

[ tglx: Make it explicit and symmetric. Massage changelog ]

Fixes: e17ba59b7e8e1 ("locking/rtmutex: Guard regular sleeping locks specific functions")
Reported-by: Crystal Wood <swood@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Link: https://lore.kernel.org/4b4ab374d3e24e6ea8df5cadc4297619a6d945af.camel@redhat.com
Link: https://lore.kernel.org/r/20230427111937.2745231-3-bigeasy@linutronix.de
---
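Note for reviewers, not part of the commit message: below is a minimal
standalone userspace sketch (not kernel code) of the ordering this patch
makes explicit. All identifiers in it (toy_*, current_task) are invented
for illustration; only the ordering mirrors the change: the
work-submission hook runs before the slowpath sets pi_blocked_on, and the
resume hook runs after it has been cleared again, so a lock taken from
inside either hook can never overwrite the pointer.

/*
 * Toy model of submit/resume-work ordering around a blocking slowpath.
 * Nothing here is a kernel API; the names are made up for illustration.
 */
#include <assert.h>
#include <stdio.h>

struct toy_lock { const char *name; };

struct toy_task {
	/* Models current::pi_blocked_on: the single lock we block on. */
	struct toy_lock *pi_blocked_on;
};

static struct toy_task current_task;

/* Models sched_submit_work(): may itself take sleeping locks. */
static void toy_submit_work(void)
{
	/*
	 * Running this before the slowpath keeps the invariant that
	 * pi_blocked_on is unused here, so a lock taken from this hook
	 * cannot overwrite it.
	 */
	assert(current_task.pi_blocked_on == NULL);
	printf("submit work: flush queued block requests\n");
}

/* Models sched_resume_work(): runs after the lock was acquired. */
static void toy_resume_work(void)
{
	assert(current_task.pi_blocked_on == NULL);
	printf("resume work: restart worker machinery\n");
}

/* Models the rtmutex slowpath: blocks with pi_blocked_on set. */
static void toy_slowlock(struct toy_lock *lock)
{
	current_task.pi_blocked_on = lock;
	printf("blocked on %s\n", lock->name);
	/* schedule() runs here without submitting work again */
	current_task.pi_blocked_on = NULL;
}

int main(void)
{
	struct toy_lock mtx = { .name = "rtmutex" };

	toy_submit_work();	/* before blocking, pi_blocked_on still NULL */
	toy_slowlock(&mtx);	/* slowpath sets and clears pi_blocked_on */
	toy_resume_work();	/* after the lock is acquired */
	return 0;
}
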
 kernel/locking/rtmutex.c     | 11 +++++++++--
 kernel/locking/rwbase_rt.c   | 18 ++++++++++++++++--
 kernel/locking/rwsem.c       |  6 ++++++
 kernel/locking/spinlock_rt.c |  3 +++
 4 files changed, 34 insertions(+), 4 deletions(-)

Index: linux-6.3.0-rt11/kernel/locking/rtmutex.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rtmutex.c
+++ linux-6.3.0-rt11/kernel/locking/rtmutex.c
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1558 @ static int __sched rt_mutex_slowlock_blo
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			schedule_rtmutex();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1587 @ static void __sched rt_mutex_handle_dead
 	WARN(1, "rtmutex deadlock detected\n");
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		schedule_rtmutex();
 	}
 }
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1683 @ static int __sched rt_mutex_slowlock(str
 	int ret;
 
 	/*
+	 * The task is about to sleep. Invoke sched_submit_work() before
+	 * blocking as that might take locks and corrupt tsk::pi_blocked_on.
+	 */
+	sched_submit_work();
+
+	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
 	 * (debug, no architecture support). In this case we will acquire the
@ linux-6.3.0-rt11/kernel/locking/rtmutex.c:1700 @ static int __sched rt_mutex_slowlock(str
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
 
+	sched_resume_work();
 	return ret;
 }
Index: linux-6.3.0-rt11/kernel/locking/rwbase_rt.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rwbase_rt.c
+++ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c
@ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c:134 @ static int __sched __rwbase_read_lock(st
 static __always_inline int rwbase_read_lock(struct rwbase_rt *rwb,
 					    unsigned int state)
 {
+	int ret;
+
 	if (rwbase_read_trylock(rwb))
 		return 0;
 
-	return __rwbase_read_lock(rwb, state);
+	/*
+	 * The task is about to sleep. For rwsems this submits work as that
+	 * might take locks and corrupt tsk::pi_blocked_on. Must be
+	 * explicit here because __rwbase_read_lock() cannot invoke
+	 * rt_mutex_slowlock(). NOP for rwlocks.
+	 */
+	rwbase_sched_submit_work();
+	ret = __rwbase_read_lock(rwb, state);
+	rwbase_sched_resume_work();
+	return ret;
 }
 
 static void __sched __rwbase_read_unlock(struct rwbase_rt *rwb,
@ linux-6.3.0-rt11/kernel/locking/rwbase_rt.c:244 @ static int __sched rwbase_write_lock(str
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	unsigned long flags;
 
-	/* Take the rtmutex as a first step */
+	/*
+	 * Take the rtmutex as a first step. For rwsem this will also
+	 * invoke sched_submit_work() to flush IO and workers.
+	 */
 	if (rwbase_rtmutex_lock_state(rtm, state))
 		return -EINTR;
 
Index: linux-6.3.0-rt11/kernel/locking/rwsem.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/rwsem.c
+++ linux-6.3.0-rt11/kernel/locking/rwsem.c
@ linux-6.3.0-rt11/kernel/locking/rwsem.c:1418 @ static inline void __downgrade_write(str
 #define rwbase_rtmutex_lock_state(rtm, state)		\
 	__rt_mutex_lock(rtm, state)
 
+#define rwbase_sched_submit_work()			\
+	sched_submit_work()
+
+#define rwbase_sched_resume_work()			\
+	sched_resume_work()
+
 #define rwbase_rtmutex_slowlock_locked(rtm, state)	\
 	__rt_mutex_slowlock_locked(rtm, NULL, state)
Index: linux-6.3.0-rt11/kernel/locking/spinlock_rt.c
===================================================================
--- linux-6.3.0-rt11.orig/kernel/locking/spinlock_rt.c
+++ linux-6.3.0-rt11/kernel/locking/spinlock_rt.c
@ linux-6.3.0-rt11/kernel/locking/spinlock_rt.c:162 @ rwbase_rtmutex_lock_state(struct rt_mute
 	return 0;
 }
 
+static __always_inline void rwbase_sched_submit_work(void) { }
+static __always_inline void rwbase_sched_resume_work(void) { }
+
 static __always_inline int
 rwbase_rtmutex_slowlock_locked(struct rt_mutex_base *rtm, unsigned int state)
 {