From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
Date: Tue, 12 Mar 2024 19:01:50 +0100
Subject: [PATCH 2/4] perf: Enqueue SIGTRAP always via task_work.

A signal is delivered by raising irq_work() which works from any context
including NMI. irq_work() can be delayed if the architecture does not
provide an interrupt vector. In order not to lose a signal, the signal
is injected via task_work during event_sched_out().

Instead of going via irq_work, the signal can be added directly via
task_work. The signal is sent to current and can be enqueued on its
return path to userland instead of triggering irq_work. A dummy IRQ is
required in the NMI case to ensure the task_work is handled before
returning to userland. For this, irq_work is used. An alternative would
be to just raise an interrupt like arch_send_call_function_single_ipi().

During testing with `remove_on_exec' it became visible that the event
can be enqueued via NMI during execve(). The task_work must not be kept
because free_event() will complain later. Also the new task will not
have a sighandler installed.

Queue the signal via task_work. Remove perf_event::pending_sigtrap and
use perf_event::pending_work instead. Raise irq_work in the NMI case
for a dummy interrupt. Remove the task_work if the event is freed.

Tested-by: Marco Elver <elver@google.com>
Tested-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Reported-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: https://lore.kernel.org/r/20240312180814.3373778-3-bigeasy@linutronix.de
Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
---
 include/linux/perf_event.h |  3 +--
 kernel/events/core.c       | 57 +++++++++++++++++++++++++---------------------
 2 files changed, 33 insertions(+), 27 deletions(-)

Index: linux-6.8.2-rt10/include/linux/perf_event.h
===================================================================
--- linux-6.8.2-rt10.orig/include/linux/perf_event.h
+++ linux-6.8.2-rt10/include/linux/perf_event.h
@ linux-6.8.2-rt10/include/linux/perf_event.h:784 @ struct perf_event {
 	unsigned int			pending_wakeup;
 	unsigned int			pending_kill;
 	unsigned int			pending_disable;
-	unsigned int			pending_sigtrap;
 	unsigned long			pending_addr;	/* SIGTRAP */
 	struct irq_work			pending_irq;
 	struct callback_head		pending_task;
@ linux-6.8.2-rt10/include/linux/perf_event.h:961 @ struct perf_event_context {
 	struct rcu_head			rcu_head;
 
 	/*
-	 * Sum (event->pending_sigtrap + event->pending_work)
+	 * Sum (event->pending_work)
 	 *
 	 * The SIGTRAP is targeted at ctx->task, as such it won't do changing
 	 * that until the signal is delivered.
Index: linux-6.8.2-rt10/kernel/events/core.c
===================================================================
--- linux-6.8.2-rt10.orig/kernel/events/core.c
+++ linux-6.8.2-rt10/kernel/events/core.c
@ linux-6.8.2-rt10/kernel/events/core.c:2286 @ event_sched_out(struct perf_event *event
 		state = PERF_EVENT_STATE_OFF;
 	}
 
-	if (event->pending_sigtrap) {
-		bool dec = true;
-
-		event->pending_sigtrap = 0;
-		if (state != PERF_EVENT_STATE_OFF &&
-		    !event->pending_work) {
-			event->pending_work = 1;
-			dec = false;
-			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
-			task_work_add(current, &event->pending_task, TWA_RESUME);
-		}
-		if (dec)
-			local_dec(&event->ctx->nr_pending);
-	}
-
 	perf_event_set_state(event, state);
 
 	if (!is_software_event(event))
@ linux-6.8.2-rt10/kernel/events/core.c:6729 @ static void __perf_pending_irq(struct pe
 	 * Yay, we hit home and are in the context of the event.
 	 */
 	if (cpu == smp_processor_id()) {
-		if (event->pending_sigtrap) {
-			event->pending_sigtrap = 0;
-			perf_sigtrap(event);
-			local_dec(&event->ctx->nr_pending);
-		}
 		if (event->pending_disable) {
 			event->pending_disable = 0;
 			perf_event_disable_local(event);
@ linux-6.8.2-rt10/kernel/events/core.c:9575 @ static int __perf_event_overflow(struct
 		if (regs)
 			pending_id = hash32_ptr((void *)instruction_pointer(regs)) ?: 1;
 
-		if (!event->pending_sigtrap) {
-			event->pending_sigtrap = pending_id;
+		if (!event->pending_work) {
+			event->pending_work = pending_id;
 			local_inc(&event->ctx->nr_pending);
-			irq_work_queue(&event->pending_irq);
+			WARN_ON_ONCE(!atomic_long_inc_not_zero(&event->refcount));
+			task_work_add(current, &event->pending_task, TWA_RESUME);
+			/*
+			 * The NMI path returns directly to userland. The
+			 * irq_work is raised as a dummy interrupt to ensure
+			 * the regular return path to user is taken and the
+			 * task_work is processed.
+			 */
+			if (in_nmi())
+				irq_work_queue(&event->pending_irq);
 		} else if (event->attr.exclude_kernel && valid_sample) {
 			/*
 			 * Should not be able to return to user space without
-			 * consuming pending_sigtrap; with exceptions:
+			 * consuming pending_work; with exceptions:
 			 *
 			 * 1. Where !exclude_kernel, events can overflow again
 			 *    in the kernel without returning to user space.
@ linux-6.8.2-rt10/kernel/events/core.c:9601 @ static int __perf_event_overflow(struct
 			 * To approximate progress (with false negatives),
 			 * check 32-bit hash of the current IP.
 			 */
-			WARN_ON_ONCE(event->pending_sigtrap != pending_id);
+			WARN_ON_ONCE(event->pending_work != pending_id);
 		}
 
 		event->pending_addr = 0;
@ linux-6.8.2-rt10/kernel/events/core.c:13041 @ static void sync_child_event(struct perf
 			     &parent_event->child_total_time_running);
 }
 
+static bool task_work_cb_match(struct callback_head *cb, void *data)
+{
+	struct perf_event *event = container_of(cb, struct perf_event, pending_task);
+
+	return event == data;
+}
+
 static void
 perf_event_exit_event(struct perf_event *event, struct perf_event_context *ctx)
 {
@ linux-6.8.2-rt10/kernel/events/core.c:13087 @ perf_event_exit_event(struct perf_event
 		 * Kick perf_poll() for is_event_hup();
 		 */
 		perf_event_wakeup(parent_event);
+		/*
+		 * Cancel pending task_work and update counters if it has not
+		 * yet been delivered to userland. free_event() expects the
+		 * reference counter at one; keeping the event around until
+		 * the task returns to userland would be unexpected.
+		 */
+		if (event->pending_work &&
+		    task_work_cancel_match(current, task_work_cb_match, event)) {
+			put_event(event);
+			local_dec(&event->ctx->nr_pending);
+		}
 		free_event(event);
 		put_event(parent_event);
 		return;
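
For readers who want the task_work pairing in isolation, below is a
minimal sketch of the lifecycle the patch relies on. It is illustrative
only, not code from this patch: my_event, my_queue_signal() and
my_pending_task() are made-up names, while task_work_add(), TWA_RESUME,
in_nmi(), irq_work_queue() and task_work_cancel_match() are the actual
kernel interfaces used above.

/*
 * Sketch of the queue side of the pattern. The struct mirrors the
 * relevant perf_event members; everything else is simplified.
 */
#include <linux/atomic.h>
#include <linux/hardirq.h>
#include <linux/irq_work.h>
#include <linux/sched.h>
#include <linux/task_work.h>

struct my_event {
	atomic_long_t		refcount;
	unsigned int		pending_work;
	struct callback_head	pending_task;	/* task_work callback head */
	struct irq_work		pending_irq;	/* dummy IRQ for the NMI path */
};

/* Runs in task context, on current's return path to userland. */
static void my_pending_task(struct callback_head *head)
{
	struct my_event *e = container_of(head, struct my_event, pending_task);

	e->pending_work = 0;
	/* ... deliver the signal to current here ... */
	/* ... drop the reference taken when the work was queued ... */
}

/* Overflow path: may run in any context, including NMI. */
static void my_queue_signal(struct my_event *e)
{
	if (e->pending_work)
		return;
	e->pending_work = 1;
	/* Hold a reference so the event outlives the queued work. */
	WARN_ON_ONCE(!atomic_long_inc_not_zero(&e->refcount));
	task_work_add(current, &e->pending_task, TWA_RESUME);
	/*
	 * An NMI returns directly to userland without passing the
	 * task_work hook; a dummy irq_work forces the regular exit
	 * path so the queued work gets processed.
	 */
	if (in_nmi())
		irq_work_queue(&e->pending_irq);
}

The callback head is assumed to have been set up once beforehand with
init_task_work(&e->pending_task, my_pending_task) (and pending_irq via
init_irq_work()). On teardown, a not-yet-delivered item is removed with
task_work_cancel_match(current, match_fn, e), where the match callback
recovers the event from the callback_head via container_of(), exactly
what task_work_cb_match() in the patch does; the queued reference and
the nr_pending counter are then dropped by hand, since free_event()
expects the refcount at one.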