From 0a3061d9c53ef3e251a4dedbe25909babd64e8aa Mon Sep 17 00:00:00 2001
From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Fri, 8 Sep 2023 18:22:52 +0200
Subject: [PATCH 005/204] locking/rtmutex: Use rt_mutex specific scheduler
 helpers

Have rt_mutex use the rt_mutex specific scheduler helpers to avoid
recursion vs rtlock on the PI state: the pre-schedule work, which may
itself block on an rtlock (PREEMPT_RT spinlock), is now done before a
waiter is queued, so it can no longer recurse back into the rt_mutex
code and enqueue a second waiter for the same task.

[[ peterz: adapted to new names ]]
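
As a rough sketch of what the helpers are expected to do (not the
verbatim implementation; sched_submit_work(), sched_update_worker(),
__schedule_loop() and SM_NONE are assumptions based on the 6.6
scheduler and the preceding patch in this series):

	void rt_mutex_pre_schedule(void)
	{
		/* rtlock-prone work, done before a waiter is queued */
		sched_submit_work(current);
	}

	void rt_mutex_schedule(void)
	{
		/* the bare __schedule() loop, without the work hooks */
		__schedule_loop(SM_NONE);
	}

	void rt_mutex_post_schedule(void)
	{
		/* rtlock-prone work, done after the waiter is unqueued */
		sched_update_worker(current);
	}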

Reported-by: Crystal Wood <swood@redhat.com>
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lkml.kernel.org/r/20230908162254.999499-6-bigeasy@linutronix.de
---
 kernel/futex/pi.c            |   11 +++++++++++
 kernel/locking/rtmutex.c     |   14 ++++++++++++--
 kernel/locking/rwbase_rt.c   |    6 ++++++
 kernel/locking/rwsem.c       |    8 +++++++-
 kernel/locking/spinlock_rt.c |    4 ++++
 5 files changed, 40 insertions(+), 3 deletions(-)
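
Pieced together from the hunks below, the resulting ordering in
rt_mutex_slowlock() is (illustrative only, details elided):

	static int __sched rt_mutex_slowlock(...)
	{
		rt_mutex_pre_schedule();	/* no waiter queued yet */

		raw_spin_lock_irqsave(&lock->wait_lock, flags);
		ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
		/* -> rt_mutex_slowlock_block() -> rt_mutex_schedule(),
		 *    called while the waiter is queued */
		raw_spin_unlock_irqrestore(&lock->wait_lock, flags);

		rt_mutex_post_schedule();	/* waiter is unqueued again */
		return ret;
	}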

Index: linux-6.6.58-rt45/kernel/futex/pi.c
===================================================================
--- linux-6.6.58-rt45.orig/kernel/futex/pi.c
+++ linux-6.6.58-rt45/kernel/futex/pi.c
@ linux-6.6.58-rt45/kernel/futex/pi.c:4 @
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #include <linux/slab.h>
+#include <linux/sched/rt.h>
 #include <linux/sched/task.h>
 
 #include "futex.h"
@ linux-6.6.58-rt45/kernel/futex/pi.c:1006 @ retry_private:
 		goto no_block;
 	}
 
+	/*
+	 * Must be done before we enqueue the waiter, here is unfortunately
+	 * under the hb lock, but that *should* work because it does nothing yet.
+	 */
+	rt_mutex_pre_schedule();
+
 	rt_mutex_init_waiter(&rt_waiter);
 
 	/*
@ linux-6.6.58-rt45/kernel/futex/pi.c:1062 @ cleanup:
 	if (ret && !rt_mutex_cleanup_proxy_lock(&q.pi_state->pi_mutex, &rt_waiter))
 		ret = 0;
 
+	/*
+	 * Waiter is unqueued.
+	 */
+	rt_mutex_post_schedule();
 no_block:
 	/*
 	 * Fixup the pi_state owner and possibly acquire the lock if we
Index: linux-6.6.58-rt45/kernel/locking/rtmutex.c
===================================================================
--- linux-6.6.58-rt45.orig/kernel/locking/rtmutex.c
+++ linux-6.6.58-rt45/kernel/locking/rtmutex.c
@ linux-6.6.58-rt45/kernel/locking/rtmutex.c:1635 @ static int __sched rt_mutex_slowlock_blo
 		raw_spin_unlock_irq(&lock->wait_lock);
 
 		if (!owner || !rtmutex_spin_on_owner(lock, waiter, owner))
-			schedule();
+			rt_mutex_schedule();
 
 		raw_spin_lock_irq(&lock->wait_lock);
 		set_current_state(state);
@ linux-6.6.58-rt45/kernel/locking/rtmutex.c:1665 @ static void __sched rt_mutex_handle_dead
 
 	while (1) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		schedule();
+		rt_mutex_schedule();
 	}
 }
 
@ linux-6.6.58-rt45/kernel/locking/rtmutex.c:1761 @ static int __sched rt_mutex_slowlock(str
 	int ret;
 
 	/*
+	 * Do all pre-schedule work here, before we queue a waiter and invoke
+	 * PI -- any such work that trips on rtlock (PREEMPT_RT spinlock) would
+	 * otherwise recurse back into task_blocks_on_rt_mutex() through
+	 * rtlock_slowlock() and will then enqueue a second waiter for this
+	 * same task and things get really confusing real fast.
+	 */
+	rt_mutex_pre_schedule();
+
+	/*
 	 * Technically we could use raw_spin_[un]lock_irq() here, but this can
 	 * be called in early boot if the cmpxchg() fast path is disabled
 	 * (debug, no architecture support). In this case we will acquire the
@ linux-6.6.58-rt45/kernel/locking/rtmutex.c:1780 @ static int __sched rt_mutex_slowlock(str
 	raw_spin_lock_irqsave(&lock->wait_lock, flags);
 	ret = __rt_mutex_slowlock_locked(lock, ww_ctx, state);
 	raw_spin_unlock_irqrestore(&lock->wait_lock, flags);
+	rt_mutex_post_schedule();
 
 	return ret;
 }
Index: linux-6.6.58-rt45/kernel/locking/rwbase_rt.c
===================================================================
--- linux-6.6.58-rt45.orig/kernel/locking/rwbase_rt.c
+++ linux-6.6.58-rt45/kernel/locking/rwbase_rt.c
@ linux-6.6.58-rt45/kernel/locking/rwbase_rt.c:74 @ static int __sched __rwbase_read_lock(st
 	struct rt_mutex_base *rtm = &rwb->rtmutex;
 	int ret;
 
+	rwbase_pre_schedule();
 	raw_spin_lock_irq(&rtm->wait_lock);
 
 	/*
@ linux-6.6.58-rt45/kernel/locking/rwbase_rt.c:129 @ static int __sched __rwbase_read_lock(st
 		rwbase_rtmutex_unlock(rtm);
 
 	trace_contention_end(rwb, ret);
+	rwbase_post_schedule();
 	return ret;
 }
 
@ linux-6.6.58-rt45/kernel/locking/rwbase_rt.c:242 @ static int __sched rwbase_write_lock(str
 	/* Force readers into slow path */
 	atomic_sub(READER_BIAS, &rwb->readers);
 
+	rwbase_pre_schedule();
+
 	raw_spin_lock_irqsave(&rtm->wait_lock, flags);
 	if (__rwbase_write_trylock(rwb))
 		goto out_unlock;
@ linux-6.6.58-rt45/kernel/locking/rwbase_rt.c:255 @ static int __sched rwbase_write_lock(str
 		if (rwbase_signal_pending_state(state, current)) {
 			rwbase_restore_current_state();
 			__rwbase_write_unlock(rwb, 0, flags);
+			rwbase_post_schedule();
 			trace_contention_end(rwb, -EINTR);
 			return -EINTR;
 		}
@ linux-6.6.58-rt45/kernel/locking/rwbase_rt.c:274 @ static int __sched rwbase_write_lock(str
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&rtm->wait_lock, flags);
+	rwbase_post_schedule();
 	return 0;
 }
 
Index: linux-6.6.58-rt45/kernel/locking/rwsem.c
===================================================================
--- linux-6.6.58-rt45.orig/kernel/locking/rwsem.c
+++ linux-6.6.58-rt45/kernel/locking/rwsem.c
@ linux-6.6.58-rt45/kernel/locking/rwsem.c:1430 @ static inline void __downgrade_write(str
 #define rwbase_signal_pending_state(state, current)	\
 	signal_pending_state(state, current)
 
+#define rwbase_pre_schedule()				\
+	rt_mutex_pre_schedule()
+
 #define rwbase_schedule()				\
-	schedule()
+	rt_mutex_schedule()
+
+#define rwbase_post_schedule()				\
+	rt_mutex_post_schedule()
 
 #include "rwbase_rt.c"
 
Index: linux-6.6.58-rt45/kernel/locking/spinlock_rt.c
===================================================================
--- linux-6.6.58-rt45.orig/kernel/locking/spinlock_rt.c
+++ linux-6.6.58-rt45/kernel/locking/spinlock_rt.c
@ linux-6.6.58-rt45/kernel/locking/spinlock_rt.c:187 @ static __always_inline int  rwbase_rtmut
 
 #define rwbase_signal_pending_state(state, current)	(0)
 
+#define rwbase_pre_schedule()
+
 #define rwbase_schedule()				\
 	schedule_rtlock()
 
+#define rwbase_post_schedule()
+
 #include "rwbase_rt.c"
 /*
  * The common functions which get wrapped into the rwlock API.