From: Sebastian Andrzej Siewior <bigeasy@linutronix.de> Date: Tue, 12 Mar 2024 19:01:52 +0100 Subject: [PATCH 4/4] perf: Split __perf_pending_irq() out of perf_pending_irq() perf_pending_irq() invokes perf_event_wakeup() and __perf_pending_irq(). The former is in charge of waking any tasks which wait to be woken up while the latter disables perf-events. The irq_work perf_pending_irq(), while this an irq_work, the callback is invoked in thread context on PREEMPT_RT. This is needed because all the waking functions (wake_up_all(), kill_fasync()) acquire sleep locks which must not be used with disabled interrupts. Disabling events, as done by __perf_pending_irq(), expects a hardirq context and disabled interrupts. This requirement is not fulfilled on PREEMPT_RT. Split functionality based on perf_event::pending_disable into irq_work named `pending_disable_irq' and invoke it in hardirq context on PREEMPT_RT. Rename the split out callback to perf_pending_disable(). Tested-by: Marco Elver <elver@google.com> Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com> Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com> Link: https://lore.kernel.org/r/20240312180814.3373778-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de> --- include/linux/perf_event.h | 1 + kernel/events/core.c | 31 +++++++++++++++++++++++-------- 2 files changed, 24 insertions(+), 8 deletions(-) @ include/linux/perf_event.h:786 @ struct perf_event { unsigned int pending_disable; unsigned long pending_addr; /* SIGTRAP */ struct irq_work pending_irq; + struct irq_work pending_disable_irq; struct callback_head pending_task; unsigned int pending_work; --- a/kernel/events/core.c +++ b/kernel/events/core.c @ include/linux/perf_event.h:2452 @ static void __perf_event_disable(struct * hold the top-level event's child_mutex, so any descendant that * goes to exit will block in perf_event_exit_event(). * - * When called from perf_pending_irq it's OK because event->ctx + * When called from perf_pending_disable it's OK because event->ctx * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. */ @ include/linux/perf_event.h:2492 @ EXPORT_SYMBOL_GPL(perf_event_disable); void perf_event_disable_inatomic(struct perf_event *event) { event->pending_disable = 1; - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } #define MAX_INTERRUPTS (~0ULL) @ include/linux/perf_event.h:5178 @ static void perf_addr_filters_splice(str static void _free_event(struct perf_event *event) { irq_work_sync(&event->pending_irq); + irq_work_sync(&event->pending_disable_irq); unaccount_event(event); @ include/linux/perf_event.h:6715 @ static void perf_sigtrap(struct perf_eve /* * Deliver the pending work in-event-context or follow the context. */ -static void __perf_pending_irq(struct perf_event *event) +static void __perf_pending_disable(struct perf_event *event) { int cpu = READ_ONCE(event->oncpu); @ include/linux/perf_event.h:6753 @ static void __perf_pending_irq(struct pe * irq_work_queue(); // FAILS * * irq_work_run() - * perf_pending_irq() + * perf_pending_disable() * * But the event runs on CPU-B and wants disabling there. */ - irq_work_queue_on(&event->pending_irq, cpu); + irq_work_queue_on(&event->pending_disable_irq, cpu); +} + +static void perf_pending_disable(struct irq_work *entry) +{ + struct perf_event *event = container_of(entry, struct perf_event, pending_disable_irq); + int rctx; + + /* + * If we 'fail' here, that's OK, it means recursion is already disabled + * and we won't recurse 'further'. + */ + rctx = perf_swevent_get_recursion_context(); + __perf_pending_disable(event); + if (rctx >= 0) + perf_swevent_put_recursion_context(rctx); } static void perf_pending_irq(struct irq_work *entry) @ include/linux/perf_event.h:6795 @ static void perf_pending_irq(struct irq_ perf_event_wakeup(event); } - __perf_pending_irq(event); - if (rctx >= 0) perf_swevent_put_recursion_context(rctx); } @ include/linux/perf_event.h:9585 @ static int __perf_event_overflow(struct * is processed. */ if (in_nmi()) - irq_work_queue(&event->pending_irq); + irq_work_queue(&event->pending_disable_irq); } else if (event->attr.exclude_kernel && valid_sample) { /* * Should not be able to return to user space without @ include/linux/perf_event.h:11925 @ perf_event_alloc(struct perf_event_attr init_waitqueue_head(&event->waitq); init_irq_work(&event->pending_irq, perf_pending_irq); + event->pending_disable_irq = IRQ_WORK_INIT_HARD(perf_pending_disable); init_task_work(&event->pending_task, perf_pending_task); mutex_init(&event->mmap_mutex);